@agentv/core 2.10.0 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-7Q4PH265.js";
20
+ } from "./chunk-REN5PS7B.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -414,9 +414,14 @@ async function loadConfig(evalFilePath, repoRoot) {
414
414
  logWarning(`Invalid eval_patterns in ${configPath}, all entries must be strings`);
415
415
  continue;
416
416
  }
417
+ const executionDefaults = parseExecutionDefaults(
418
+ parsed.execution,
419
+ configPath
420
+ );
417
421
  return {
418
422
  guideline_patterns: guidelinePatterns,
419
- eval_patterns: evalPatterns
423
+ eval_patterns: evalPatterns,
424
+ execution: executionDefaults
420
425
  };
421
426
  } catch (error) {
422
427
  logWarning(
@@ -557,6 +562,36 @@ function extractTotalBudgetUsd(suite) {
557
562
  );
558
563
  return void 0;
559
564
  }
565
+ function parseExecutionDefaults(raw, configPath) {
566
+ if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
567
+ return void 0;
568
+ }
569
+ const obj = raw;
570
+ const result = {};
571
+ if (typeof obj.verbose === "boolean") {
572
+ result.verbose = obj.verbose;
573
+ } else if (obj.verbose !== void 0) {
574
+ logWarning(`Invalid execution.verbose in ${configPath}, expected boolean`);
575
+ }
576
+ const traceFile = obj.trace_file;
577
+ if (typeof traceFile === "string" && traceFile.trim().length > 0) {
578
+ result.trace_file = traceFile.trim();
579
+ } else if (traceFile !== void 0) {
580
+ logWarning(`Invalid execution.trace_file in ${configPath}, expected non-empty string`);
581
+ }
582
+ if (typeof obj.keep_workspaces === "boolean") {
583
+ result.keep_workspaces = obj.keep_workspaces;
584
+ } else if (obj.keep_workspaces !== void 0) {
585
+ logWarning(`Invalid execution.keep_workspaces in ${configPath}, expected boolean`);
586
+ }
587
+ const otelFile = obj.otel_file;
588
+ if (typeof otelFile === "string" && otelFile.trim().length > 0) {
589
+ result.otel_file = otelFile.trim();
590
+ } else if (otelFile !== void 0) {
591
+ logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
592
+ }
593
+ return Object.keys(result).length > 0 ? result : void 0;
594
+ }
560
595
  function logWarning(message) {
561
596
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
562
597
  }
@@ -1285,18 +1320,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1285
1320
  });
1286
1321
  continue;
1287
1322
  }
1323
+ if (typeValue === "contains_any" || typeValue === "contains_all") {
1324
+ const value = asStringArrayStrict(rawEvaluator.value);
1325
+ if (!value || value.length === 0) {
1326
+ logWarning2(
1327
+ `Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
1328
+ );
1329
+ continue;
1330
+ }
1331
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1332
+ const required2 = parseRequired(rawEvaluator.required);
1333
+ evaluators.push({
1334
+ name,
1335
+ type: typeValue,
1336
+ value,
1337
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
1338
+ ...required2 !== void 0 ? { required: required2 } : {},
1339
+ ...negate !== void 0 ? { negate } : {}
1340
+ });
1341
+ continue;
1342
+ }
1343
+ if (typeValue === "icontains") {
1344
+ const value = asString(rawEvaluator.value);
1345
+ if (!value) {
1346
+ logWarning2(`Skipping icontains evaluator '${name}' in '${evalId}': missing value`);
1347
+ continue;
1348
+ }
1349
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1350
+ const required2 = parseRequired(rawEvaluator.required);
1351
+ evaluators.push({
1352
+ name,
1353
+ type: "icontains",
1354
+ value,
1355
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
1356
+ ...required2 !== void 0 ? { required: required2 } : {},
1357
+ ...negate !== void 0 ? { negate } : {}
1358
+ });
1359
+ continue;
1360
+ }
1361
+ if (typeValue === "icontains_any" || typeValue === "icontains_all") {
1362
+ const value = asStringArrayStrict(rawEvaluator.value);
1363
+ if (!value || value.length === 0) {
1364
+ logWarning2(
1365
+ `Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
1366
+ );
1367
+ continue;
1368
+ }
1369
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1370
+ const required2 = parseRequired(rawEvaluator.required);
1371
+ evaluators.push({
1372
+ name,
1373
+ type: typeValue,
1374
+ value,
1375
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
1376
+ ...required2 !== void 0 ? { required: required2 } : {},
1377
+ ...negate !== void 0 ? { negate } : {}
1378
+ });
1379
+ continue;
1380
+ }
1381
+ if (typeValue === "starts_with" || typeValue === "ends_with") {
1382
+ const value = asString(rawEvaluator.value);
1383
+ if (!value) {
1384
+ logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
1385
+ continue;
1386
+ }
1387
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1388
+ const required2 = parseRequired(rawEvaluator.required);
1389
+ evaluators.push({
1390
+ name,
1391
+ type: typeValue,
1392
+ value,
1393
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
1394
+ ...required2 !== void 0 ? { required: required2 } : {},
1395
+ ...negate !== void 0 ? { negate } : {}
1396
+ });
1397
+ continue;
1398
+ }
1288
1399
  if (typeValue === "regex") {
1289
1400
  const value = asString(rawEvaluator.value);
1290
1401
  if (!value) {
1291
1402
  logWarning2(`Skipping regex evaluator '${name}' in '${evalId}': missing value`);
1292
1403
  continue;
1293
1404
  }
1405
+ const flags = asString(rawEvaluator.flags);
1294
1406
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1295
1407
  const required2 = parseRequired(rawEvaluator.required);
1296
1408
  evaluators.push({
1297
1409
  name,
1298
1410
  type: "regex",
1299
1411
  value,
1412
+ ...flags !== void 0 ? { flags } : {},
1300
1413
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1301
1414
  ...required2 !== void 0 ? { required: required2 } : {},
1302
1415
  ...negate !== void 0 ? { negate } : {}
@@ -1469,15 +1582,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1469
1582
  }
1470
1583
  return evaluators.length > 0 ? evaluators : void 0;
1471
1584
  }
1472
- var ASSERTION_TYPES = /* @__PURE__ */ new Set(["contains", "regex", "is_json", "equals", "rubrics"]);
1585
+ var ASSERTION_TYPES = /* @__PURE__ */ new Set([
1586
+ "contains",
1587
+ "contains_any",
1588
+ "contains_all",
1589
+ "icontains",
1590
+ "icontains_any",
1591
+ "icontains_all",
1592
+ "starts_with",
1593
+ "ends_with",
1594
+ "regex",
1595
+ "is_json",
1596
+ "equals",
1597
+ "rubrics"
1598
+ ]);
1473
1599
  function generateAssertionName(typeValue, rawEvaluator) {
1474
1600
  if (!ASSERTION_TYPES.has(typeValue)) {
1475
1601
  return void 0;
1476
1602
  }
1477
1603
  const value = asString(rawEvaluator.value);
1604
+ const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
1478
1605
  switch (typeValue) {
1479
1606
  case "contains":
1480
1607
  return value ? `contains-${value}` : "contains";
1608
+ case "contains_any":
1609
+ return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
1610
+ case "contains_all":
1611
+ return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
1612
+ case "icontains":
1613
+ return value ? `icontains-${value}` : "icontains";
1614
+ case "icontains_any":
1615
+ return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
1616
+ case "icontains_all":
1617
+ return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
1618
+ case "starts_with":
1619
+ return value ? `starts_with-${value}` : "starts_with";
1620
+ case "ends_with":
1621
+ return value ? `ends_with-${value}` : "ends_with";
1481
1622
  case "regex":
1482
1623
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
1483
1624
  case "is_json":
@@ -1503,6 +1644,13 @@ function coerceEvaluator(candidate, contextId) {
1503
1644
  function asString(value) {
1504
1645
  return typeof value === "string" ? value : void 0;
1505
1646
  }
1647
+ function asStringArrayStrict(value) {
1648
+ if (!Array.isArray(value)) {
1649
+ return void 0;
1650
+ }
1651
+ const result = value.filter((v) => typeof v === "string");
1652
+ return result.length > 0 ? result : void 0;
1653
+ }
1506
1654
  function asStringArray(value, description) {
1507
1655
  if (value === void 0) {
1508
1656
  return void 0;
@@ -2820,6 +2968,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
2820
2968
  }
2821
2969
  return cwd ? { ...config, cwd } : config;
2822
2970
  }
2971
+ function parseRepoSource(raw) {
2972
+ if (!isJsonObject(raw)) return void 0;
2973
+ const obj = raw;
2974
+ if (obj.type === "git" && typeof obj.url === "string") {
2975
+ return { type: "git", url: obj.url };
2976
+ }
2977
+ if (obj.type === "local" && typeof obj.path === "string") {
2978
+ return { type: "local", path: obj.path };
2979
+ }
2980
+ return void 0;
2981
+ }
2982
+ function parseRepoCheckout(raw) {
2983
+ if (!isJsonObject(raw)) return void 0;
2984
+ const obj = raw;
2985
+ const ref = typeof obj.ref === "string" ? obj.ref : void 0;
2986
+ const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
2987
+ const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
2988
+ if (!ref && !resolve && ancestor === void 0) return void 0;
2989
+ return {
2990
+ ...ref !== void 0 && { ref },
2991
+ ...resolve !== void 0 && { resolve },
2992
+ ...ancestor !== void 0 && { ancestor }
2993
+ };
2994
+ }
2995
+ function parseRepoClone(raw) {
2996
+ if (!isJsonObject(raw)) return void 0;
2997
+ const obj = raw;
2998
+ const depth = typeof obj.depth === "number" ? obj.depth : void 0;
2999
+ const filter = typeof obj.filter === "string" ? obj.filter : void 0;
3000
+ const sparse = Array.isArray(obj.sparse) ? obj.sparse.filter((s) => typeof s === "string") : void 0;
3001
+ if (depth === void 0 && !filter && !sparse) return void 0;
3002
+ return {
3003
+ ...depth !== void 0 && { depth },
3004
+ ...filter !== void 0 && { filter },
3005
+ ...sparse !== void 0 && { sparse }
3006
+ };
3007
+ }
3008
+ function parseRepoConfig(raw) {
3009
+ if (!isJsonObject(raw)) return void 0;
3010
+ const obj = raw;
3011
+ const repoPath = typeof obj.path === "string" ? obj.path : void 0;
3012
+ const source = parseRepoSource(obj.source);
3013
+ if (!repoPath || !source) return void 0;
3014
+ const checkout = parseRepoCheckout(obj.checkout);
3015
+ const clone = parseRepoClone(obj.clone);
3016
+ return {
3017
+ path: repoPath,
3018
+ source,
3019
+ ...checkout !== void 0 && { checkout },
3020
+ ...clone !== void 0 && { clone }
3021
+ };
3022
+ }
3023
+ function parseResetConfig(raw) {
3024
+ if (!isJsonObject(raw)) return void 0;
3025
+ const obj = raw;
3026
+ const strategy = obj.strategy === "none" || obj.strategy === "hard" || obj.strategy === "recreate" ? obj.strategy : void 0;
3027
+ const afterEach = typeof obj.after_each === "boolean" ? obj.after_each : void 0;
3028
+ if (!strategy && afterEach === void 0) return void 0;
3029
+ return {
3030
+ ...strategy !== void 0 && { strategy },
3031
+ ...afterEach !== void 0 && { after_each: afterEach }
3032
+ };
3033
+ }
2823
3034
  function parseWorkspaceConfig(raw, evalFileDir) {
2824
3035
  if (!isJsonObject(raw)) return void 0;
2825
3036
  const obj = raw;
@@ -2827,13 +3038,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
2827
3038
  if (template && !path8.isAbsolute(template)) {
2828
3039
  template = path8.resolve(evalFileDir, template);
2829
3040
  }
3041
+ const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
3042
+ const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
3043
+ const reset = parseResetConfig(obj.reset);
2830
3044
  const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
2831
3045
  const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
2832
3046
  const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
2833
3047
  const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
2834
- if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach) return void 0;
3048
+ if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
3049
+ return void 0;
2835
3050
  return {
2836
3051
  ...template !== void 0 && { template },
3052
+ ...isolation !== void 0 && { isolation },
3053
+ ...repos !== void 0 && { repos },
3054
+ ...reset !== void 0 && { reset },
2837
3055
  ...beforeAll !== void 0 && { before_all: beforeAll },
2838
3056
  ...afterAll !== void 0 && { after_all: afterAll },
2839
3057
  ...beforeEach !== void 0 && { before_each: beforeEach },
@@ -2846,6 +3064,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
2846
3064
  if (!caseLevel) return suiteLevel;
2847
3065
  return {
2848
3066
  template: caseLevel.template ?? suiteLevel.template,
3067
+ isolation: caseLevel.isolation ?? suiteLevel.isolation,
3068
+ repos: caseLevel.repos ?? suiteLevel.repos,
3069
+ reset: caseLevel.reset ?? suiteLevel.reset,
2849
3070
  before_all: caseLevel.before_all ?? suiteLevel.before_all,
2850
3071
  after_all: caseLevel.after_all ?? suiteLevel.after_all,
2851
3072
  before_each: caseLevel.before_each ?? suiteLevel.before_each,
@@ -3385,11 +3606,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
3385
3606
  }
3386
3607
  return claudeSdkModule;
3387
3608
  }
3388
- var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
3389
- - Do NOT create any additional output files in the workspace.
3390
- - All intended file outputs/changes MUST be written in your response.
3391
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
3392
- This is required for evaluation scoring.`;
3393
3609
  var ClaudeProvider = class {
3394
3610
  id;
3395
3611
  kind = "claude";
@@ -3411,7 +3627,7 @@ var ClaudeProvider = class {
3411
3627
  const logger = await this.createStreamLogger(request).catch(() => void 0);
3412
3628
  const inputFiles = normalizeInputFiles(request.inputFiles);
3413
3629
  const prompt = buildPromptDocument(request, inputFiles);
3414
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT2);
3630
+ const systemPrompt = this.config.systemPrompt;
3415
3631
  const queryOptions = {
3416
3632
  permissionMode: "bypassPermissions",
3417
3633
  allowDangerouslySkipPermissions: true,
@@ -4392,11 +4608,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
4392
4608
  }
4393
4609
  return codexSdkModule;
4394
4610
  }
4395
- var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
4396
- - Do NOT create any additional output files in the workspace.
4397
- - All intended file outputs/changes MUST be written in your response.
4398
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
4399
- This is required for evaluation scoring.`;
4400
4611
  var CodexProvider = class {
4401
4612
  id;
4402
4613
  kind = "codex";
@@ -4431,7 +4642,7 @@ var CodexProvider = class {
4431
4642
  const thread = codex.startThread(threadOptions);
4432
4643
  const inputFiles = normalizeInputFiles(request.inputFiles);
4433
4644
  const basePrompt = buildPromptDocument(request, inputFiles);
4434
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT3);
4645
+ const systemPrompt = this.config.systemPrompt;
4435
4646
  const prompt = systemPrompt ? `${systemPrompt}
4436
4647
 
4437
4648
  ${basePrompt}` : basePrompt;
@@ -4797,7 +5008,7 @@ import { arch, platform } from "node:os";
4797
5008
  import path13 from "node:path";
4798
5009
  import { fileURLToPath as fileURLToPath2 } from "node:url";
4799
5010
  function resolvePlatformCliPath() {
4800
- const os4 = platform();
5011
+ const os5 = platform();
4801
5012
  const cpu = arch();
4802
5013
  const platformMap = {
4803
5014
  linux: "linux",
@@ -4808,13 +5019,13 @@ function resolvePlatformCliPath() {
4808
5019
  x64: "x64",
4809
5020
  arm64: "arm64"
4810
5021
  };
4811
- const osPart = platformMap[os4];
5022
+ const osPart = platformMap[os5];
4812
5023
  const archPart = archMap[cpu];
4813
5024
  if (!osPart || !archPart) {
4814
5025
  return void 0;
4815
5026
  }
4816
5027
  const packageName = `@github/copilot-${osPart}-${archPart}`;
4817
- const binaryName = os4 === "win32" ? "copilot.exe" : "copilot";
5028
+ const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
4818
5029
  try {
4819
5030
  const resolved = import.meta.resolve(`${packageName}/package.json`);
4820
5031
  const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
@@ -4956,11 +5167,6 @@ function isLogStreamingDisabled(envKey) {
4956
5167
  }
4957
5168
 
4958
5169
  // src/evaluation/providers/copilot-cli.ts
4959
- var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
4960
- - Do NOT create any additional output files in the workspace.
4961
- - All intended file outputs/changes MUST be written in your response.
4962
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
4963
- This is required for evaluation scoring.`;
4964
5170
  var CopilotCliProvider = class {
4965
5171
  id;
4966
5172
  kind = "copilot-cli";
@@ -5122,6 +5328,16 @@ var CopilotCliProvider = class {
5122
5328
  }
5123
5329
  const endTime = (/* @__PURE__ */ new Date()).toISOString();
5124
5330
  const durationMs = Date.now() - startMs;
5331
+ const rejectedCalls = completedToolCalls.filter((tc) => {
5332
+ const out = tc.output;
5333
+ return out && (out.code === "rejected" || out.code === "denied");
5334
+ });
5335
+ if (rejectedCalls.length > 0) {
5336
+ const tools = rejectedCalls.map((tc) => tc.tool).join(", ");
5337
+ throw new Error(
5338
+ `Copilot rejected ${rejectedCalls.length} tool call(s): ${tools}. Add args: ["--yolo"] to your target config or re-run with --yolo to bypass permission checks.`
5339
+ );
5340
+ }
5125
5341
  const outputMessages = [];
5126
5342
  if (completedToolCalls.length > 0) {
5127
5343
  outputMessages.push({
@@ -5154,7 +5370,7 @@ var CopilotCliProvider = class {
5154
5370
  }
5155
5371
  }
5156
5372
  buildCliArgs() {
5157
- const args = ["--acp", "--stdio", "--allow-all-tools"];
5373
+ const args = ["--acp", "--stdio", "--allow-all-tools", "--yolo"];
5158
5374
  if (this.config.model) {
5159
5375
  args.push("--model", this.config.model);
5160
5376
  }
@@ -5163,8 +5379,8 @@ var CopilotCliProvider = class {
5163
5379
  }
5164
5380
  return args;
5165
5381
  }
5166
- resolveSystemPrompt(request) {
5167
- return this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT4);
5382
+ resolveSystemPrompt(_request) {
5383
+ return this.config.systemPrompt;
5168
5384
  }
5169
5385
  async raceWithTimeout(sendPromise, agentProcess) {
5170
5386
  const timeoutMs = this.config.timeoutMs;
@@ -5352,21 +5568,16 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
5352
5568
  }
5353
5569
  return copilotSdkModule;
5354
5570
  }
5355
- var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
5356
- - Do NOT create any additional output files in the workspace.
5357
- - All intended file outputs/changes MUST be written in your response.
5358
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
5359
- This is required for evaluation scoring.`;
5360
5571
  var CopilotSdkProvider = class {
5361
5572
  id;
5362
- kind = "copilot";
5573
+ kind = "copilot-sdk";
5363
5574
  targetName;
5364
5575
  supportsBatch = false;
5365
5576
  config;
5366
5577
  // biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
5367
5578
  client = null;
5368
5579
  constructor(targetName, config) {
5369
- this.id = `copilot:${targetName}`;
5580
+ this.id = `copilot-sdk:${targetName}`;
5370
5581
  this.targetName = targetName;
5371
5582
  this.config = config;
5372
5583
  }
@@ -5389,7 +5600,7 @@ var CopilotSdkProvider = class {
5389
5600
  if (cwd) {
5390
5601
  sessionOptions.workingDirectory = cwd;
5391
5602
  }
5392
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT5);
5603
+ const systemPrompt = this.config.systemPrompt;
5393
5604
  if (systemPrompt) {
5394
5605
  sessionOptions.systemMessage = {
5395
5606
  mode: "append",
@@ -5905,11 +6116,6 @@ function subscribeToPiLogEntries(listener) {
5905
6116
  // src/evaluation/providers/pi-coding-agent.ts
5906
6117
  var WORKSPACE_PREFIX = "agentv-pi-";
5907
6118
  var PROMPT_FILENAME = "prompt.md";
5908
- var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
5909
- - Do NOT create any additional output files in the workspace.
5910
- - All intended file outputs/changes MUST be written in your response.
5911
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
5912
- This is required for evaluation scoring.`;
5913
6119
  var PiCodingAgentProvider = class {
5914
6120
  id;
5915
6121
  kind = "pi-coding-agent";
@@ -5986,7 +6192,7 @@ var PiCodingAgentProvider = class {
5986
6192
  }
5987
6193
  return path16.resolve(this.config.cwd);
5988
6194
  }
5989
- buildPiArgs(prompt, inputFiles, captureFileChanges2) {
6195
+ buildPiArgs(prompt, inputFiles, _captureFileChanges) {
5990
6196
  const args = [];
5991
6197
  if (this.config.provider) {
5992
6198
  args.push("--provider", this.config.provider);
@@ -6014,7 +6220,7 @@ var PiCodingAgentProvider = class {
6014
6220
  args.push(`@${file}`);
6015
6221
  }
6016
6222
  }
6017
- const systemPrompt = this.config.systemPrompt ?? (captureFileChanges2 ? void 0 : DEFAULT_SYSTEM_PROMPT6);
6223
+ const systemPrompt = this.config.systemPrompt;
6018
6224
  const fullPrompt = systemPrompt ? `${systemPrompt}
6019
6225
 
6020
6226
  ${prompt}` : prompt;
@@ -7708,7 +7914,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
7708
7914
 
7709
7915
  **IMPORTANT**: Follow these exact steps:
7710
7916
  1. Create and write your complete response to: {{responseFileTmp}}
7711
- - Do NOT create any additional output files in the workspace.
7712
7917
  - All intended file outputs/changes MUST be written in your response file.
7713
7918
  - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
7714
7919
  2. When completely finished, run these PowerShell commands to signal completion:
@@ -7727,7 +7932,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
7727
7932
 
7728
7933
  **IMPORTANT**: Follow these exact steps:
7729
7934
  1. Create and write your complete response to: {{responseFileTmp}}
7730
- - Do NOT create any additional output files in the workspace.
7731
7935
  - All intended file outputs/changes MUST be written in your response file.
7732
7936
  - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
7733
7937
  2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
@@ -8153,7 +8357,7 @@ async function discoverProviders(registry, baseDir) {
8153
8357
  // src/evaluation/providers/index.ts
8154
8358
  function createBuiltinProviderRegistry() {
8155
8359
  const registry = new ProviderRegistry();
8156
- registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
8360
+ registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
8157
8361
  "vscode-insiders",
8158
8362
  (t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
8159
8363
  );
@@ -8342,16 +8546,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
8342
8546
  });
8343
8547
  }
8344
8548
  async function execShellWithStdin(command, stdinPayload, options = {}) {
8345
- const { mkdir: mkdir13, readFile: readFile12, rm: rm5, writeFile: writeFile8 } = await import("node:fs/promises");
8549
+ const { mkdir: mkdir14, readFile: readFile12, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
8346
8550
  const { tmpdir: tmpdir3 } = await import("node:os");
8347
- const path39 = await import("node:path");
8551
+ const path40 = await import("node:path");
8348
8552
  const { randomUUID: randomUUID8 } = await import("node:crypto");
8349
- const dir = path39.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
8350
- await mkdir13(dir, { recursive: true });
8351
- const stdinPath = path39.join(dir, "stdin.txt");
8352
- const stdoutPath = path39.join(dir, "stdout.txt");
8353
- const stderrPath = path39.join(dir, "stderr.txt");
8354
- await writeFile8(stdinPath, stdinPayload, "utf8");
8553
+ const dir = path40.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
8554
+ await mkdir14(dir, { recursive: true });
8555
+ const stdinPath = path40.join(dir, "stdin.txt");
8556
+ const stdoutPath = path40.join(dir, "stdout.txt");
8557
+ const stderrPath = path40.join(dir, "stderr.txt");
8558
+ await writeFile9(stdinPath, stdinPayload, "utf8");
8355
8559
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
8356
8560
  const { spawn: spawn4 } = await import("node:child_process");
8357
8561
  try {
@@ -8384,7 +8588,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
8384
8588
  const stderr = (await readFile12(stderrPath, "utf8")).replace(/\r\n/g, "\n");
8385
8589
  return { stdout, stderr, exitCode };
8386
8590
  } finally {
8387
- await rm5(dir, { recursive: true, force: true });
8591
+ await rm6(dir, { recursive: true, force: true });
8388
8592
  }
8389
8593
  }
8390
8594
 
@@ -8702,7 +8906,7 @@ var CodeEvaluator = class {
8702
8906
  outputPath,
8703
8907
  guidelineFiles: context.evalCase.guideline_paths,
8704
8908
  inputFiles: context.evalCase.file_paths.filter(
8705
- (path39) => !context.evalCase.guideline_paths.includes(path39)
8909
+ (path40) => !context.evalCase.guideline_paths.includes(path40)
8706
8910
  ),
8707
8911
  input: context.evalCase.input,
8708
8912
  trace: context.trace ?? null,
@@ -8950,13 +9154,15 @@ ${context.fileChanges}`;
8950
9154
  evaluatorRawRequest,
8951
9155
  tokenUsage
8952
9156
  };
8953
- } catch {
9157
+ } catch (e) {
9158
+ const message = e instanceof Error ? e.message : String(e);
8954
9159
  return {
8955
9160
  score: 0,
8956
- verdict: "fail",
9161
+ verdict: "skip",
8957
9162
  hits: [],
8958
- misses: [],
9163
+ misses: [`Judge parse failure after 3 attempts: ${message}`],
8959
9164
  expectedAspectCount: 1,
9165
+ reasoning: `Judge parse failure after 3 attempts: ${message}`,
8960
9166
  evaluatorRawRequest
8961
9167
  };
8962
9168
  }
@@ -9898,115 +10104,115 @@ var FieldAccuracyEvaluator = class {
9898
10104
  * Evaluate a single field against the expected value.
9899
10105
  */
9900
10106
  evaluateField(fieldConfig, candidateData, expectedData) {
9901
- const { path: path39, match, required = true, weight = 1 } = fieldConfig;
9902
- const candidateValue = resolvePath(candidateData, path39);
9903
- const expectedValue = resolvePath(expectedData, path39);
10107
+ const { path: path40, match, required = true, weight = 1 } = fieldConfig;
10108
+ const candidateValue = resolvePath(candidateData, path40);
10109
+ const expectedValue = resolvePath(expectedData, path40);
9904
10110
  if (expectedValue === void 0) {
9905
10111
  return {
9906
- path: path39,
10112
+ path: path40,
9907
10113
  score: 1,
9908
10114
  // No expected value means no comparison needed
9909
10115
  weight,
9910
10116
  hit: true,
9911
- message: `${path39}: no expected value`
10117
+ message: `${path40}: no expected value`
9912
10118
  };
9913
10119
  }
9914
10120
  if (candidateValue === void 0) {
9915
10121
  if (required) {
9916
10122
  return {
9917
- path: path39,
10123
+ path: path40,
9918
10124
  score: 0,
9919
10125
  weight,
9920
10126
  hit: false,
9921
- message: `${path39} (required, missing)`
10127
+ message: `${path40} (required, missing)`
9922
10128
  };
9923
10129
  }
9924
10130
  return {
9925
- path: path39,
10131
+ path: path40,
9926
10132
  score: 1,
9927
10133
  // Don't penalize missing optional fields
9928
10134
  weight: 0,
9929
10135
  // Zero weight means it won't affect the score
9930
10136
  hit: true,
9931
- message: `${path39}: optional field missing`
10137
+ message: `${path40}: optional field missing`
9932
10138
  };
9933
10139
  }
9934
10140
  switch (match) {
9935
10141
  case "exact":
9936
- return this.compareExact(path39, candidateValue, expectedValue, weight);
10142
+ return this.compareExact(path40, candidateValue, expectedValue, weight);
9937
10143
  case "numeric_tolerance":
9938
10144
  return this.compareNumericTolerance(
9939
- path39,
10145
+ path40,
9940
10146
  candidateValue,
9941
10147
  expectedValue,
9942
10148
  fieldConfig,
9943
10149
  weight
9944
10150
  );
9945
10151
  case "date":
9946
- return this.compareDate(path39, candidateValue, expectedValue, fieldConfig, weight);
10152
+ return this.compareDate(path40, candidateValue, expectedValue, fieldConfig, weight);
9947
10153
  default:
9948
10154
  return {
9949
- path: path39,
10155
+ path: path40,
9950
10156
  score: 0,
9951
10157
  weight,
9952
10158
  hit: false,
9953
- message: `${path39}: unknown match type "${match}"`
10159
+ message: `${path40}: unknown match type "${match}"`
9954
10160
  };
9955
10161
  }
9956
10162
  }
9957
10163
  /**
9958
10164
  * Exact equality comparison.
9959
10165
  */
9960
- compareExact(path39, candidateValue, expectedValue, weight) {
10166
+ compareExact(path40, candidateValue, expectedValue, weight) {
9961
10167
  if (deepEqual(candidateValue, expectedValue)) {
9962
10168
  return {
9963
- path: path39,
10169
+ path: path40,
9964
10170
  score: 1,
9965
10171
  weight,
9966
10172
  hit: true,
9967
- message: path39
10173
+ message: path40
9968
10174
  };
9969
10175
  }
9970
10176
  if (typeof candidateValue !== typeof expectedValue) {
9971
10177
  return {
9972
- path: path39,
10178
+ path: path40,
9973
10179
  score: 0,
9974
10180
  weight,
9975
10181
  hit: false,
9976
- message: `${path39} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
10182
+ message: `${path40} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
9977
10183
  };
9978
10184
  }
9979
10185
  return {
9980
- path: path39,
10186
+ path: path40,
9981
10187
  score: 0,
9982
10188
  weight,
9983
10189
  hit: false,
9984
- message: `${path39} (value mismatch)`
10190
+ message: `${path40} (value mismatch)`
9985
10191
  };
9986
10192
  }
9987
10193
  /**
9988
10194
  * Numeric comparison with absolute or relative tolerance.
9989
10195
  */
9990
- compareNumericTolerance(path39, candidateValue, expectedValue, fieldConfig, weight) {
10196
+ compareNumericTolerance(path40, candidateValue, expectedValue, fieldConfig, weight) {
9991
10197
  const { tolerance = 0, relative = false } = fieldConfig;
9992
10198
  const candidateNum = toNumber2(candidateValue);
9993
10199
  const expectedNum = toNumber2(expectedValue);
9994
10200
  if (candidateNum === null || expectedNum === null) {
9995
10201
  return {
9996
- path: path39,
10202
+ path: path40,
9997
10203
  score: 0,
9998
10204
  weight,
9999
10205
  hit: false,
10000
- message: `${path39} (non-numeric value)`
10206
+ message: `${path40} (non-numeric value)`
10001
10207
  };
10002
10208
  }
10003
10209
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
10004
10210
  return {
10005
- path: path39,
10211
+ path: path40,
10006
10212
  score: 0,
10007
10213
  weight,
10008
10214
  hit: false,
10009
- message: `${path39} (invalid numeric value)`
10215
+ message: `${path40} (invalid numeric value)`
10010
10216
  };
10011
10217
  }
10012
10218
  const diff = Math.abs(candidateNum - expectedNum);
@@ -10019,61 +10225,61 @@ var FieldAccuracyEvaluator = class {
10019
10225
  }
10020
10226
  if (withinTolerance) {
10021
10227
  return {
10022
- path: path39,
10228
+ path: path40,
10023
10229
  score: 1,
10024
10230
  weight,
10025
10231
  hit: true,
10026
- message: `${path39} (within tolerance: diff=${diff.toFixed(2)})`
10232
+ message: `${path40} (within tolerance: diff=${diff.toFixed(2)})`
10027
10233
  };
10028
10234
  }
10029
10235
  return {
10030
- path: path39,
10236
+ path: path40,
10031
10237
  score: 0,
10032
10238
  weight,
10033
10239
  hit: false,
10034
- message: `${path39} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
10240
+ message: `${path40} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
10035
10241
  };
10036
10242
  }
10037
10243
  /**
10038
10244
  * Date comparison with format normalization.
10039
10245
  */
10040
- compareDate(path39, candidateValue, expectedValue, fieldConfig, weight) {
10246
+ compareDate(path40, candidateValue, expectedValue, fieldConfig, weight) {
10041
10247
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
10042
10248
  const candidateDate = parseDate(String(candidateValue), formats);
10043
10249
  const expectedDate = parseDate(String(expectedValue), formats);
10044
10250
  if (candidateDate === null) {
10045
10251
  return {
10046
- path: path39,
10252
+ path: path40,
10047
10253
  score: 0,
10048
10254
  weight,
10049
10255
  hit: false,
10050
- message: `${path39} (unparseable candidate date)`
10256
+ message: `${path40} (unparseable candidate date)`
10051
10257
  };
10052
10258
  }
10053
10259
  if (expectedDate === null) {
10054
10260
  return {
10055
- path: path39,
10261
+ path: path40,
10056
10262
  score: 0,
10057
10263
  weight,
10058
10264
  hit: false,
10059
- message: `${path39} (unparseable expected date)`
10265
+ message: `${path40} (unparseable expected date)`
10060
10266
  };
10061
10267
  }
10062
10268
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
10063
10269
  return {
10064
- path: path39,
10270
+ path: path40,
10065
10271
  score: 1,
10066
10272
  weight,
10067
10273
  hit: true,
10068
- message: path39
10274
+ message: path40
10069
10275
  };
10070
10276
  }
10071
10277
  return {
10072
- path: path39,
10278
+ path: path40,
10073
10279
  score: 0,
10074
10280
  weight,
10075
10281
  hit: false,
10076
- message: `${path39} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
10282
+ message: `${path40} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
10077
10283
  };
10078
10284
  }
10079
10285
  /**
@@ -10114,11 +10320,11 @@ var FieldAccuracyEvaluator = class {
10114
10320
  };
10115
10321
  }
10116
10322
  };
10117
- function resolvePath(obj, path39) {
10118
- if (!path39 || !obj) {
10323
+ function resolvePath(obj, path40) {
10324
+ if (!path40 || !obj) {
10119
10325
  return void 0;
10120
10326
  }
10121
- const parts = path39.split(/\.|\[|\]/).filter((p) => p.length > 0);
10327
+ const parts = path40.split(/\.|\[|\]/).filter((p) => p.length > 0);
10122
10328
  let current = obj;
10123
10329
  for (const part of parts) {
10124
10330
  if (current === null || current === void 0) {
@@ -10936,8 +11142,8 @@ var TokenUsageEvaluator = class {
10936
11142
  };
10937
11143
 
10938
11144
  // src/evaluation/evaluators/tool-trajectory.ts
10939
- function getNestedValue(obj, path39) {
10940
- const parts = path39.split(".");
11145
+ function getNestedValue(obj, path40) {
11146
+ const parts = path40.split(".");
10941
11147
  let current = obj;
10942
11148
  for (const part of parts) {
10943
11149
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -11401,13 +11607,78 @@ function runContainsAssertion(output, value) {
11401
11607
  misses: passed ? [] : [`Output does not contain "${value}"`]
11402
11608
  };
11403
11609
  }
11404
- function runRegexAssertion(output, pattern) {
11405
- const regex = new RegExp(pattern);
11610
/**
 * Pass when `output` contains at least one of `values` (case-sensitive).
 *
 * @param {string} output - Text to search.
 * @param {string[]} values - Candidate substrings.
 * @returns {{score: number, hits: string[], misses: string[]}} Score 1 with the first
 *   matching value reported as a hit, or score 0 with every candidate listed as a miss.
 */
function runContainsAnyAssertion(output, values) {
  const found = values.filter((candidate) => output.includes(candidate));
  if (found.length === 0) {
    const quoted = values.map((candidate) => `"${candidate}"`).join(", ");
    return { score: 0, hits: [], misses: [`Output does not contain any of: ${quoted}`] };
  }
  return { score: 1, hits: [`Output contains "${found[0]}"`], misses: [] };
}
11619
/**
 * Pass when `output` contains every entry of `values` (case-sensitive).
 *
 * @param {string} output - Text to search.
 * @param {string[]} values - Substrings that must all be present.
 * @returns {{score: number, hits: string[], misses: string[]}} Score 1 when nothing is
 *   missing (including an empty `values`), otherwise score 0 listing the absent strings.
 */
function runContainsAllAssertion(output, values) {
  const absent = values.filter((needle) => !output.includes(needle));
  if (absent.length > 0) {
    const quoted = absent.map((needle) => `"${needle}"`).join(", ");
    return { score: 0, hits: [], misses: [`Output missing: ${quoted}`] };
  }
  return {
    score: 1,
    hits: [`Output contains all ${values.length} expected strings`],
    misses: []
  };
}
11628
/**
 * Case-insensitive substring check: both sides are lower-cased before comparison.
 *
 * @param {string} output - Text to search.
 * @param {string} value - Substring to look for.
 * @returns {{score: number, hits: string[], misses: string[]}} Messages quote `value` as given.
 */
function runIcontainsAssertion(output, value) {
  const haystack = output.toLowerCase();
  const needle = value.toLowerCase();
  if (haystack.includes(needle)) {
    return { score: 1, hits: [`Output contains "${value}" (case-insensitive)`], misses: [] };
  }
  return { score: 0, hits: [], misses: [`Output does not contain "${value}" (case-insensitive)`] };
}
11636
/**
 * Pass when `output` contains at least one of `values`, ignoring case.
 *
 * @param {string} output - Text to search.
 * @param {string[]} values - Candidate substrings.
 * @returns {{score: number, hits: string[], misses: string[]}} The first matching value
 *   (as originally cased) is reported as the hit.
 */
function runIcontainsAnyAssertion(output, values) {
  const haystack = output.toLowerCase();
  const found = values.filter((candidate) => haystack.includes(candidate.toLowerCase()));
  if (found.length === 0) {
    const quoted = values.map((candidate) => `"${candidate}"`).join(", ");
    return {
      score: 0,
      hits: [],
      misses: [`Output does not contain any of: ${quoted} (case-insensitive)`]
    };
  }
  return {
    score: 1,
    hits: [`Output contains "${found[0]}" (case-insensitive)`],
    misses: []
  };
}
11648
/**
 * Pass when `output` contains every entry of `values`, ignoring case.
 *
 * @param {string} output - Text to search.
 * @param {string[]} values - Substrings that must all be present.
 * @returns {{score: number, hits: string[], misses: string[]}} Missing values are listed
 *   with their original casing.
 */
function runIcontainsAllAssertion(output, values) {
  const haystack = output.toLowerCase();
  const absent = values.filter((needle) => !haystack.includes(needle.toLowerCase()));
  if (absent.length > 0) {
    const quoted = absent.map((needle) => `"${needle}"`).join(", ");
    return { score: 0, hits: [], misses: [`Output missing (case-insensitive): ${quoted}`] };
  }
  return {
    score: 1,
    hits: [`Output contains all ${values.length} expected strings (case-insensitive)`],
    misses: []
  };
}
11658
/**
 * Pass when the trimmed `output` begins with the trimmed `value`.
 *
 * @param {string} output - Text under test; surrounding whitespace is ignored.
 * @param {string} value - Expected prefix; surrounding whitespace is ignored for matching
 *   but the untrimmed value is what appears in the messages.
 * @returns {{score: number, hits: string[], misses: string[]}}
 */
function runStartsWithAssertion(output, value) {
  const text = output.trim();
  const prefix = value.trim();
  if (text.startsWith(prefix)) {
    return { score: 1, hits: [`Output starts with "${value}"`], misses: [] };
  }
  return { score: 0, hits: [], misses: [`Output does not start with "${value}"`] };
}
11666
/**
 * Pass when the trimmed `output` ends with the trimmed `value`.
 *
 * @param {string} output - Text under test; surrounding whitespace is ignored.
 * @param {string} value - Expected suffix; surrounding whitespace is ignored for matching
 *   but the untrimmed value is what appears in the messages.
 * @returns {{score: number, hits: string[], misses: string[]}}
 */
function runEndsWithAssertion(output, value) {
  const text = output.trim();
  const suffix = value.trim();
  if (text.endsWith(suffix)) {
    return { score: 1, hits: [`Output ends with "${value}"`], misses: [] };
  }
  return { score: 0, hits: [], misses: [`Output does not end with "${value}"`] };
}
11674
/**
 * Test `output` against a regular expression built from `pattern` and optional `flags`.
 *
 * @param {string} output - Text under test.
 * @param {string} pattern - Regular-expression source.
 * @param {string} [flags] - Optional RegExp flags (e.g. "i").
 * @returns {{score: number, hits: string[], misses: string[]}} Messages render the pattern
 *   as `/pattern/flags`, followed by ` (flags: …)` when flags were supplied.
 * @throws {SyntaxError} If `pattern` or `flags` is not accepted by the RegExp constructor.
 */
function runRegexAssertion(output, pattern, flags) {
  const matched = new RegExp(pattern, flags).test(output);
  const flagsNote = flags ? ` (flags: ${flags})` : "";
  const rendered = `/${pattern}/${flags ?? ""}${flagsNote}`;
  if (matched) {
    return { score: 1, hits: [`Output matches pattern ${rendered}`], misses: [] };
  }
  return { score: 0, hits: [], misses: [`Output does not match pattern ${rendered}`] };
}
11413
11684
  function runIsJsonAssertion(output) {
@@ -11433,9 +11704,9 @@ function runEqualsAssertion(output, value) {
11433
11704
  }
11434
11705
 
11435
11706
  // src/evaluation/orchestrator.ts
11436
- import { createHash, randomUUID as randomUUID7 } from "node:crypto";
11437
- import { mkdir as mkdir11 } from "node:fs/promises";
11438
- import path36 from "node:path";
11707
+ import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
11708
+ import { mkdir as mkdir12 } from "node:fs/promises";
11709
+ import path37 from "node:path";
11439
11710
  import micromatch4 from "micromatch";
11440
11711
 
11441
11712
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -11825,13 +12096,13 @@ var containsFactory = (config) => {
11825
12096
  var regexFactory = (config) => {
11826
12097
  const c = config;
11827
12098
  return new DeterministicAssertionEvaluator("regex", (ctx) => {
11828
- const result = runRegexAssertion(ctx.candidate, c.value);
12099
+ const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
11829
12100
  return {
11830
12101
  score: result.score,
11831
12102
  verdict: result.score === 1 ? "pass" : "fail",
11832
12103
  hits: result.hits,
11833
12104
  misses: result.misses,
11834
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/` : `Output does not match pattern /${c.value}/`,
12105
+ reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
11835
12106
  expectedAspectCount: 1
11836
12107
  };
11837
12108
  });
@@ -11863,9 +12134,107 @@ var equalsFactory = (config) => {
11863
12134
  };
11864
12135
  });
11865
12136
  };
12137
/**
 * Build an evaluator factory for a deterministic assertion whose runner has the
 * signature `(candidate, value) => { score, hits, misses }`.
 *
 * The seven factories below were identical apart from the evaluator type name and the
 * runner they call, so they share this helper.
 *
 * @param {string} type - Evaluator type name passed to DeterministicAssertionEvaluator.
 * @param {(candidate: string, value: any) => {score: number, hits: string[], misses: string[]}} runAssertion
 * @returns {(config: any) => object} Factory taking the evaluator config (reads `config.value`).
 */
function createValueAssertionFactory(type, runAssertion) {
  return (config) => {
    const c = config;
    return new DeterministicAssertionEvaluator(type, (ctx) => {
      const result = runAssertion(ctx.candidate, c.value);
      const passed = result.score === 1;
      return {
        score: result.score,
        verdict: passed ? "pass" : "fail",
        hits: result.hits,
        misses: result.misses,
        // The runner emits exactly one hit on success or one miss on failure.
        reasoning: passed ? result.hits[0] : result.misses[0],
        expectedAspectCount: 1
      };
    });
  };
}
var containsAnyFactory = createValueAssertionFactory("contains_any", runContainsAnyAssertion);
var containsAllFactory = createValueAssertionFactory("contains_all", runContainsAllAssertion);
var icontainsFactory = createValueAssertionFactory("icontains", runIcontainsAssertion);
var icontainsAnyFactory = createValueAssertionFactory("icontains_any", runIcontainsAnyAssertion);
var icontainsAllFactory = createValueAssertionFactory("icontains_all", runIcontainsAllAssertion);
var startsWithFactory = createValueAssertionFactory("starts_with", runStartsWithAssertion);
var endsWithFactory = createValueAssertionFactory("ends_with", runEndsWithAssertion);
11866
12235
  function createBuiltinRegistry() {
11867
12236
  const registry = new EvaluatorRegistry();
11868
- registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
12237
+ registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
11869
12238
  return registry;
11870
12239
  }
11871
12240
 
@@ -12209,18 +12578,236 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
12209
12578
  }
12210
12579
  }
12211
12580
 
12581
+ // src/evaluation/workspace/repo-manager.ts
12582
+ import { execFile } from "node:child_process";
12583
+ import { createHash } from "node:crypto";
12584
+ import { existsSync as existsSync2 } from "node:fs";
12585
+ import { mkdir as mkdir11, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
12586
+ import os4 from "node:os";
12587
+ import path35 from "node:path";
12588
+ import { promisify as promisify5 } from "node:util";
12589
// Promise-returning form of child_process.execFile.
var execFileAsync = promisify5(execFile);
// Default cache location: <home>/.agentv/git-cache
var DEFAULT_CACHE_DIR = path35.join(os4.homedir(), ".agentv", "git-cache");
// Default timeout for a single git command: 5 minutes.
var DEFAULT_TIMEOUT_MS2 = 5 * 60 * 1000;
// Maximum time to wait for a cache lock file: 1 minute.
var LOCK_TIMEOUT_MS = 60 * 1000;
12593
/**
 * Build the environment for git subprocesses.
 *
 * Starts from a copy of `process.env` (the real environment is not mutated), drops every
 * inherited `GIT_*` variable, and forces non-interactive operation: no terminal prompt,
 * no askpass helper, and SSH in batch mode.
 *
 * A caller-supplied `GIT_SSH_COMMAND` is honoured and has `-o BatchMode=yes` appended.
 * Previously the variable was exempted from the delete loop but then unconditionally
 * overwritten, so custom SSH settings (identity file, proxy command, …) were lost.
 *
 * @returns {Record<string, string | undefined>} Environment object to pass to the child process.
 */
function gitEnv() {
  const env = { ...process.env };
  const userSshCommand = env.GIT_SSH_COMMAND?.trim();
  for (const key of Object.keys(env)) {
    if (key.startsWith("GIT_")) {
      delete env[key];
    }
  }
  return {
    ...env,
    GIT_TERMINAL_PROMPT: "0",
    GIT_ASKPASS: "",
    GIT_SSH_COMMAND: userSshCommand ? `${userSshCommand} -o BatchMode=yes` : "ssh -o BatchMode=yes"
  };
}
12607
/**
 * Derive the cache directory name for a repo source.
 *
 * Git sources are keyed by their lower-cased URL with a trailing ".git" removed;
 * any other source is keyed by its `path` unchanged.
 *
 * @param {{type: string, url?: string, path?: string}} source
 * @returns {string} Hex-encoded SHA-256 digest of the normalised identity.
 */
function cacheKey(source) {
  let identity;
  if (source.type === "git") {
    identity = source.url.toLowerCase().replace(/\.git$/, "");
  } else {
    identity = source.path;
  }
  const hasher = createHash("sha256");
  hasher.update(identity);
  return hasher.digest("hex");
}
12611
/**
 * Return the location to clone from: `url` for git sources, `path` for anything else.
 *
 * @param {{type: string, url?: string, path?: string}} source
 * @returns {string | undefined}
 */
function getSourceUrl(source) {
  if (source.type === "git") {
    return source.url;
  }
  return source.path;
}
12614
/**
 * Run `git` with the given arguments and return its trimmed stdout.
 *
 * @param {string[]} args - Arguments passed to the git executable.
 * @param {{cwd?: string, timeout?: number}} [opts] - Working directory and timeout in
 *   milliseconds; the timeout falls back to DEFAULT_TIMEOUT_MS2.
 * @returns {Promise<string>} Trimmed standard output.
 */
async function git(args, opts) {
  const execOptions = {
    cwd: opts?.cwd,
    timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
    env: gitEnv(),
    // Allow up to 50MB of output before execFile aborts.
    maxBuffer: 50 * 1024 * 1024
  };
  const result = await execFileAsync("git", args, execOptions);
  return result.stdout.trim();
}
12624
/**
 * Acquire an exclusive lock by creating `lockPath` with the "wx" flag (fails if the file
 * already exists). Retries every 200 ms while the file exists, up to LOCK_TIMEOUT_MS.
 *
 * NOTE(review): the lock file stores this process's pid but nothing visible here reads it
 * back, so a lock left by a crashed process appears to block until removed manually.
 *
 * @param {string} lockPath - Path of the lock file to create.
 * @returns {Promise<void>}
 * @throws {Error} On timeout, or any file-system error other than EEXIST.
 */
async function acquireLock(lockPath) {
  const startedAt = Date.now();
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  for (;;) {
    if (Date.now() - startedAt >= LOCK_TIMEOUT_MS) {
      throw new Error(`Timed out waiting for lock: ${lockPath}`);
    }
    try {
      await writeFile7(lockPath, String(process.pid), { flag: "wx" });
      return;
    } catch (err) {
      if (err.code !== "EEXIST") {
        throw err;
      }
    }
    // Someone else holds the lock: wait briefly and try again.
    await pause(200);
  }
}
12640
/**
 * Release a lock by deleting its lock file. Best-effort: any failure to delete
 * (for example the file is already gone) is deliberately ignored.
 *
 * @param {string} lockPath - Path of the lock file to remove.
 * @returns {Promise<void>}
 */
async function releaseLock(lockPath) {
  await unlink(lockPath).catch(() => {});
}
12646
/**
 * Manages bare-mirror git caches and materializes repositories from them into
 * evaluation workspaces.
 */
var RepoManager = class {
  cacheDir;
  /**
   * @param {string} [cacheDir] - Cache root; defaults to DEFAULT_CACHE_DIR.
   */
  constructor(cacheDir) {
    this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
  }
  /**
   * Ensure a bare mirror cache exists for the given source.
   * Creates it on first access and fetches updates on subsequent calls, holding a
   * per-cache lock file for the duration.
   *
   * @param {object} source - Repo source (`type` plus `url` or `path`).
   * @param {number} [depth] - Optional shallow depth for the clone/fetch.
   * @returns {Promise<string>} Path to the cache directory.
   */
  async ensureCache(source, depth) {
    const key = cacheKey(source);
    const cachePath = path35.join(this.cacheDir, key);
    const lockPath = `${cachePath}.lock`;
    await mkdir11(this.cacheDir, { recursive: true });
    await acquireLock(lockPath);
    try {
      // A HEAD file marks an already-initialised bare repository.
      if (existsSync2(path35.join(cachePath, "HEAD"))) {
        const fetchArgs = ["fetch", "--prune"];
        if (depth) {
          fetchArgs.push("--depth", String(depth));
        }
        await git(fetchArgs, { cwd: cachePath });
      } else {
        const cloneArgs = ["clone", "--mirror", "--bare"];
        if (depth) {
          cloneArgs.push("--depth", String(depth));
        }
        const sourceUrl = getSourceUrl(source);
        // git ignores --depth for plain local paths, so shallow local clones use file://.
        const cloneUrl = depth && source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
        cloneArgs.push(cloneUrl, cachePath);
        await git(cloneArgs);
      }
    } finally {
      await releaseLock(lockPath);
    }
    return cachePath;
  }
  /**
   * Clone a repo from cache into the workspace at the configured path.
   * Handles shallow/filtered clone, sparse checkout, ref resolution, checkout and
   * ancestor walking.
   *
   * @param {object} repo - Repo config (`path`, `source`, optional `clone` and `checkout`).
   * @param {string} workspacePath - Workspace root the repo is cloned under.
   * @returns {Promise<void>}
   * @throws {Error} If the ref is not found on the remote, a git command fails, or the
   *   requested ancestor cannot be resolved.
   */
  async materialize(repo, workspacePath) {
    const targetDir = path35.join(workspacePath, repo.path);
    const cachePath = await this.ensureCache(repo.source, repo.clone?.depth);
    const cloneArgs = ["clone"];
    if (repo.clone?.depth) {
      cloneArgs.push("--depth", String(repo.clone.depth));
    }
    if (repo.clone?.filter) {
      cloneArgs.push("--filter", repo.clone.filter);
    }
    cloneArgs.push("--no-checkout");
    // Shallow or filtered clones go through file:// rather than a plain local path.
    const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
    cloneArgs.push(cloneUrl, targetDir);
    await git(cloneArgs);
    if (repo.clone?.sparse?.length) {
      await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
      await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
    }
    const ref = repo.checkout?.ref ?? "HEAD";
    const resolve = repo.checkout?.resolve ?? "remote";
    let resolvedSha;
    if (resolve === "remote" && repo.source.type === "git") {
      const url = getSourceUrl(repo.source);
      try {
        const lsOutput = await git(["ls-remote", url, ref]);
        // ls-remote prints "<sha>\t<refname>" per line. Splitting on any whitespace
        // yields the first sha; splitting on a single space returned the whole output.
        const match = lsOutput.split(/\s+/)[0];
        if (!match) {
          throw new Error(`Ref '${ref}' not found on remote ${url}`);
        }
        resolvedSha = match;
      } catch (err) {
        // A missing ref is fatal; any other ls-remote failure falls back to the raw ref.
        if (err instanceof Error && err.message.includes("not found")) throw err;
        resolvedSha = ref;
      }
    } else {
      resolvedSha = ref;
    }
    await git(["checkout", resolvedSha], { cwd: targetDir });
    const ancestor = repo.checkout?.ancestor ?? 0;
    if (ancestor > 0) {
      try {
        const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
        await git(["checkout", ancestorSha], { cwd: targetDir });
      } catch {
        if (repo.clone?.depth) {
          // Shallow history may not reach the ancestor yet: deepen and retry once.
          await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
          const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
          await git(["checkout", ancestorSha], { cwd: targetDir });
        } else {
          throw new Error(
            `Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`
          );
        }
      }
    }
  }
  /**
   * Materialize all repos into the workspace, one after another.
   *
   * @param {object[]} repos
   * @param {string} workspacePath
   * @returns {Promise<void>}
   */
  async materializeAll(repos, workspacePath) {
    for (const repo of repos) {
      await this.materialize(repo, workspacePath);
    }
  }
  /**
   * Reset repos in the workspace.
   * With strategy "recreate" each repo directory is deleted and materialized again;
   * otherwise each repo gets `git reset --hard HEAD` followed by `git clean -fd`.
   *
   * @param {object[]} repos
   * @param {string} workspacePath
   * @param {string} strategy
   * @returns {Promise<void>}
   */
  async reset(repos, workspacePath, strategy) {
    if (strategy === "recreate") {
      for (const repo of repos) {
        const targetDir = path35.join(workspacePath, repo.path);
        await rm5(targetDir, { recursive: true, force: true });
      }
      await this.materializeAll(repos, workspacePath);
      return;
    }
    for (const repo of repos) {
      const targetDir = path35.join(workspacePath, repo.path);
      await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
      await git(["clean", "-fd"], { cwd: targetDir });
    }
  }
  /**
   * Seed the cache from a local repository, then point the cache's `origin` at `remoteUrl`.
   * Useful for avoiding slow network clones when a local clone already exists.
   *
   * @param {string} localPath - Existing local repository to mirror.
   * @param {string} remoteUrl - Remote URL the cache entry is keyed by and will fetch from.
   * @param {{force?: boolean}} [opts] - Set `force` to replace an existing cache entry.
   * @returns {Promise<string>} Path to the cache directory.
   * @throws {Error} If a cache already exists and `force` is not set.
   */
  async seedCache(localPath, remoteUrl, opts) {
    const source = { type: "git", url: remoteUrl };
    const key = cacheKey(source);
    const cachePath = path35.join(this.cacheDir, key);
    const lockPath = `${cachePath}.lock`;
    await mkdir11(this.cacheDir, { recursive: true });
    await acquireLock(lockPath);
    try {
      if (existsSync2(path35.join(cachePath, "HEAD"))) {
        if (!opts?.force) {
          throw new Error(
            `Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
          );
        }
        await rm5(cachePath, { recursive: true, force: true });
      }
      await git(["clone", "--mirror", "--bare", localPath, cachePath]);
      await git(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
    } finally {
      await releaseLock(lockPath);
    }
    return cachePath;
  }
  /** Remove the entire cache directory. */
  async cleanCache() {
    await rm5(this.cacheDir, { recursive: true, force: true });
  }
};
12798
+
12212
12799
  // src/evaluation/workspace/resolve.ts
12213
12800
  import { readdir as readdir4, stat as stat6 } from "node:fs/promises";
12214
- import path35 from "node:path";
12801
+ import path36 from "node:path";
12215
12802
  async function resolveWorkspaceTemplate(templatePath) {
12216
12803
  if (!templatePath) {
12217
12804
  return void 0;
12218
12805
  }
12219
- const resolved = path35.resolve(templatePath);
12806
+ const resolved = path36.resolve(templatePath);
12220
12807
  const stats = await stat6(resolved);
12221
12808
  if (stats.isFile()) {
12222
12809
  return {
12223
- dir: path35.dirname(resolved),
12810
+ dir: path36.dirname(resolved),
12224
12811
  workspaceFile: resolved
12225
12812
  };
12226
12813
  }
@@ -12232,14 +12819,14 @@ async function resolveWorkspaceTemplate(templatePath) {
12232
12819
  if (workspaceFiles.length === 1) {
12233
12820
  return {
12234
12821
  dir: resolved,
12235
- workspaceFile: path35.join(resolved, workspaceFiles[0])
12822
+ workspaceFile: path36.join(resolved, workspaceFiles[0])
12236
12823
  };
12237
12824
  }
12238
12825
  if (workspaceFiles.length > 1) {
12239
12826
  const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
12240
12827
  return {
12241
12828
  dir: resolved,
12242
- workspaceFile: conventionFile ? path35.join(resolved, conventionFile) : void 0
12829
+ workspaceFile: conventionFile ? path36.join(resolved, conventionFile) : void 0
12243
12830
  };
12244
12831
  }
12245
12832
  return { dir: resolved };
@@ -12361,6 +12948,11 @@ async function runEvaluation(options) {
12361
12948
  }
12362
12949
  return getOrCreateProvider(resolvedJudge);
12363
12950
  };
12951
+ if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
12952
+ throw new Error(
12953
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
12954
+ );
12955
+ }
12364
12956
  const targetResolver = (name) => {
12365
12957
  const resolved = resolveTargetByName(name);
12366
12958
  if (!resolved) {
@@ -12374,7 +12966,7 @@ async function runEvaluation(options) {
12374
12966
  ];
12375
12967
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
12376
12968
  const typeRegistry = createBuiltinRegistry();
12377
- const discoveryBaseDir = evalFilePath ? path36.dirname(path36.resolve(evalFilePath)) : process.cwd();
12969
+ const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
12378
12970
  await discoverAssertions(typeRegistry, discoveryBaseDir);
12379
12971
  const providerRegistry = createBuiltinProviderRegistry();
12380
12972
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -12429,7 +13021,8 @@ async function runEvaluation(options) {
12429
13021
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
12430
13022
  const workspaceTemplate = resolvedTemplate?.dir;
12431
13023
  const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
12432
- const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all);
13024
+ const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
13025
+ const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
12433
13026
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
12434
13027
  const workers = hasSharedWorkspace ? 1 : requestedWorkers;
12435
13028
  if (hasSharedWorkspace && requestedWorkers > 1) {
@@ -12448,9 +13041,22 @@ async function runEvaluation(options) {
12448
13041
  const message = error instanceof Error ? error.message : String(error);
12449
13042
  throw new Error(`Failed to create shared workspace: ${message}`);
12450
13043
  }
12451
- } else if (suiteWorkspace?.before_all) {
13044
+ } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
12452
13045
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
12453
- await mkdir11(sharedWorkspacePath, { recursive: true });
13046
+ await mkdir12(sharedWorkspacePath, { recursive: true });
13047
+ }
13048
+ const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
13049
+ if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
13050
+ try {
13051
+ await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
13052
+ } catch (error) {
13053
+ const message = error instanceof Error ? error.message : String(error);
13054
+ if (sharedWorkspacePath) {
13055
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
13056
+ });
13057
+ }
13058
+ throw new Error(`Failed to materialize repos: ${message}`);
13059
+ }
12454
13060
  }
12455
13061
  if (sharedWorkspacePath && suiteWorkspace?.before_all) {
12456
13062
  const scriptContext = {
@@ -12541,7 +13147,8 @@ async function runEvaluation(options) {
12541
13147
  sharedBaselineCommit,
12542
13148
  suiteWorkspaceFile,
12543
13149
  streamCallbacks,
12544
- typeRegistry
13150
+ typeRegistry,
13151
+ repoManager
12545
13152
  };
12546
13153
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
12547
13154
  if (totalBudgetUsd !== void 0) {
@@ -12816,15 +13423,16 @@ async function runEvalCase(options) {
12816
13423
  sharedWorkspacePath,
12817
13424
  sharedBaselineCommit,
12818
13425
  suiteWorkspaceFile,
12819
- typeRegistry: providedTypeRegistry
13426
+ typeRegistry: providedTypeRegistry,
13427
+ repoManager
12820
13428
  } = options;
12821
13429
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
12822
13430
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
12823
13431
  const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
12824
- const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
13432
+ const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
12825
13433
  let cachedResponse;
12826
- if (cacheKey && cache) {
12827
- cachedResponse = await cache.get(cacheKey);
13434
+ if (cacheKey2 && cache) {
13435
+ cachedResponse = await cache.get(cacheKey2);
12828
13436
  }
12829
13437
  const nowFn = now ?? (() => /* @__PURE__ */ new Date());
12830
13438
  let workspacePath = sharedWorkspacePath;
@@ -12853,9 +13461,25 @@ async function runEvalCase(options) {
12853
13461
  );
12854
13462
  }
12855
13463
  }
12856
- if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
13464
+ if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
12857
13465
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
12858
- await mkdir11(workspacePath, { recursive: true });
13466
+ await mkdir12(workspacePath, { recursive: true });
13467
+ }
13468
+ if (evalCase.workspace?.repos?.length && workspacePath) {
13469
+ const perCaseRepoManager = new RepoManager();
13470
+ try {
13471
+ await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
13472
+ } catch (error) {
13473
+ const message = error instanceof Error ? error.message : String(error);
13474
+ return buildErrorResult(
13475
+ evalCase,
13476
+ target.name,
13477
+ nowFn(),
13478
+ new Error(`Failed to materialize repos: ${message}`),
13479
+ promptInputs,
13480
+ provider
13481
+ );
13482
+ }
12859
13483
  }
12860
13484
  if (workspacePath && evalCase.workspace?.before_all) {
12861
13485
  const scriptContext = {
@@ -12979,8 +13603,8 @@ async function runEvalCase(options) {
12979
13603
  }
12980
13604
  return errorResult;
12981
13605
  }
12982
- if (cacheKey && cache && !cachedResponse) {
12983
- await cache.set(cacheKey, providerResponse);
13606
+ if (cacheKey2 && cache && !cachedResponse) {
13607
+ await cache.set(cacheKey2, providerResponse);
12984
13608
  }
12985
13609
  const output = providerResponse.output;
12986
13610
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
@@ -13008,6 +13632,16 @@ async function runEvalCase(options) {
13008
13632
  }
13009
13633
  }
13010
13634
  const providerError = extractProviderError(providerResponse);
13635
+ if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
13636
+ try {
13637
+ await repoManager.reset(
13638
+ evalCase.workspace.repos,
13639
+ workspacePath,
13640
+ evalCase.workspace.reset.strategy
13641
+ );
13642
+ } catch {
13643
+ }
13644
+ }
13011
13645
  if (workspacePath && evalCase.workspace?.after_each) {
13012
13646
  const scriptContext = {
13013
13647
  workspacePath,
@@ -13372,7 +14006,7 @@ async function runEvaluatorList(options) {
13372
14006
  fileChanges,
13373
14007
  workspacePath
13374
14008
  };
13375
- const evalFileDir = evalCase.guideline_paths[0] ? path36.dirname(evalCase.guideline_paths[0]) : process.cwd();
14009
+ const evalFileDir = evalCase.guideline_paths[0] ? path37.dirname(evalCase.guideline_paths[0]) : process.cwd();
13376
14010
  const dispatchContext = {
13377
14011
  judgeProvider,
13378
14012
  targetResolver,
@@ -13462,8 +14096,9 @@ async function runEvaluatorList(options) {
13462
14096
  const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
13463
14097
  return entry.score.score < minScore;
13464
14098
  });
13465
- const aggregateScore = hasRequiredFailure ? 0 : scored.length > 0 ? computeWeightedMean(
13466
- scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
14099
+ const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
14100
+ const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
14101
+ scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
13467
14102
  ) : 0;
13468
14103
  const hits = scored.flatMap((entry) => entry.score.hits);
13469
14104
  const misses = scored.flatMap((entry) => entry.score.misses);
@@ -13603,7 +14238,7 @@ function extractProviderError(response) {
13603
14238
  return trimmed.length > 0 ? trimmed : void 0;
13604
14239
  }
13605
14240
  function createCacheKey(provider, target, evalCase, promptInputs) {
13606
- const hash = createHash("sha256");
14241
+ const hash = createHash2("sha256");
13607
14242
  hash.update(provider.id);
13608
14243
  hash.update(target.name);
13609
14244
  hash.update(evalCase.id);
@@ -13671,8 +14306,8 @@ function computeWeightedMean(entries) {
13671
14306
  }
13672
14307
 
13673
14308
  // src/evaluation/evaluate.ts
13674
- import { existsSync as existsSync2 } from "node:fs";
13675
- import path37 from "node:path";
14309
+ import { existsSync as existsSync3 } from "node:fs";
14310
+ import path38 from "node:path";
13676
14311
  async function evaluate(config) {
13677
14312
  const startTime = Date.now();
13678
14313
  if (config.tests && config.specFile) {
@@ -13694,13 +14329,13 @@ async function evaluate(config) {
13694
14329
  let evalCases;
13695
14330
  let testFilePath;
13696
14331
  if (config.specFile) {
13697
- testFilePath = path37.resolve(config.specFile);
14332
+ testFilePath = path38.resolve(config.specFile);
13698
14333
  evalCases = await loadTests(testFilePath, repoRoot, {
13699
14334
  verbose: config.verbose,
13700
14335
  filter: config.filter
13701
14336
  });
13702
14337
  } else {
13703
- testFilePath = path37.join(process.cwd(), "__programmatic__.yaml");
14338
+ testFilePath = path38.join(process.cwd(), "__programmatic__.yaml");
13704
14339
  evalCases = (config.tests ?? []).map((test) => {
13705
14340
  const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
13706
14341
  const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
@@ -13791,11 +14426,11 @@ function computeSummary(results, durationMs) {
13791
14426
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
13792
14427
  async function discoverDefaultTarget(repoRoot) {
13793
14428
  const cwd = process.cwd();
13794
- const chain = buildDirectoryChain(path37.join(cwd, "_placeholder"), repoRoot);
14429
+ const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
13795
14430
  for (const dir of chain) {
13796
14431
  for (const candidate of TARGET_FILE_CANDIDATES) {
13797
- const targetsPath = path37.join(dir, candidate);
13798
- if (!existsSync2(targetsPath)) continue;
14432
+ const targetsPath = path38.join(dir, candidate);
14433
+ if (!existsSync3(targetsPath)) continue;
13799
14434
  try {
13800
14435
  const definitions = await readTargetDefinitions(targetsPath);
13801
14436
  const defaultTarget = definitions.find((d) => d.name === "default");
@@ -13809,11 +14444,11 @@ async function discoverDefaultTarget(repoRoot) {
13809
14444
  async function loadEnvHierarchy(repoRoot) {
13810
14445
  const { readFileSync: readFileSync2 } = await import("node:fs");
13811
14446
  const cwd = process.cwd();
13812
- const chain = buildDirectoryChain(path37.join(cwd, "_placeholder"), repoRoot);
14447
+ const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
13813
14448
  const envFiles = [];
13814
14449
  for (const dir of chain) {
13815
- const envPath = path37.join(dir, ".env");
13816
- if (existsSync2(envPath)) envFiles.push(envPath);
14450
+ const envPath = path38.join(dir, ".env");
14451
+ if (existsSync3(envPath)) envFiles.push(envPath);
13817
14452
  }
13818
14453
  for (let i = envFiles.length - 1; i >= 0; i--) {
13819
14454
  try {
@@ -13883,12 +14518,12 @@ var CONFIG_FILE_NAMES = [
13883
14518
  ".agentv/config.js"
13884
14519
  ];
13885
14520
  async function loadTsConfig(projectRoot) {
13886
- const { existsSync: existsSync3 } = await import("node:fs");
14521
+ const { existsSync: existsSync4 } = await import("node:fs");
13887
14522
  const { pathToFileURL } = await import("node:url");
13888
14523
  const { join: join2 } = await import("node:path");
13889
14524
  for (const fileName of CONFIG_FILE_NAMES) {
13890
14525
  const filePath = join2(projectRoot, fileName);
13891
- if (!existsSync3(filePath)) {
14526
+ if (!existsSync4(filePath)) {
13892
14527
  continue;
13893
14528
  }
13894
14529
  try {
@@ -13985,8 +14620,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
13985
14620
  }
13986
14621
 
13987
14622
  // src/evaluation/cache/response-cache.ts
13988
- import { mkdir as mkdir12, readFile as readFile11, writeFile as writeFile7 } from "node:fs/promises";
13989
- import path38 from "node:path";
14623
+ import { mkdir as mkdir13, readFile as readFile11, writeFile as writeFile8 } from "node:fs/promises";
14624
+ import path39 from "node:path";
13990
14625
  var DEFAULT_CACHE_PATH = ".agentv/cache";
13991
14626
  var ResponseCache = class {
13992
14627
  cachePath;
@@ -14004,13 +14639,13 @@ var ResponseCache = class {
14004
14639
  }
14005
14640
  async set(key, value) {
14006
14641
  const filePath = this.keyToPath(key);
14007
- const dir = path38.dirname(filePath);
14008
- await mkdir12(dir, { recursive: true });
14009
- await writeFile7(filePath, JSON.stringify(value, null, 2), "utf8");
14642
+ const dir = path39.dirname(filePath);
14643
+ await mkdir13(dir, { recursive: true });
14644
+ await writeFile8(filePath, JSON.stringify(value, null, 2), "utf8");
14010
14645
  }
14011
14646
  keyToPath(key) {
14012
14647
  const prefix = key.slice(0, 2);
14013
- return path38.join(this.cachePath, prefix, `${key}.json`);
14648
+ return path39.join(this.cachePath, prefix, `${key}.json`);
14014
14649
  }
14015
14650
  };
14016
14651
  function shouldEnableCache(params) {
@@ -14483,6 +15118,7 @@ export {
14483
15118
  OtelTraceExporter,
14484
15119
  OtlpJsonFileExporter,
14485
15120
  ProviderRegistry,
15121
+ RepoManager,
14486
15122
  ResponseCache,
14487
15123
  SimpleTraceFileExporter,
14488
15124
  TEST_MESSAGE_ROLES,
@@ -14568,12 +15204,19 @@ export {
14568
15204
  resolveTargetDefinition,
14569
15205
  resolveWorkspaceTemplate,
14570
15206
  rubricEvaluationSchema,
15207
+ runContainsAllAssertion,
15208
+ runContainsAnyAssertion,
14571
15209
  runContainsAssertion,
15210
+ runEndsWithAssertion,
14572
15211
  runEqualsAssertion,
14573
15212
  runEvalCase,
14574
15213
  runEvaluation,
15214
+ runIcontainsAllAssertion,
15215
+ runIcontainsAnyAssertion,
15216
+ runIcontainsAssertion,
14575
15217
  runIsJsonAssertion,
14576
15218
  runRegexAssertion,
15219
+ runStartsWithAssertion,
14577
15220
  scoreToVerdict,
14578
15221
  shouldEnableCache,
14579
15222
  shouldSkipCacheForTemperature,