@agentv/core 2.10.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1244,12 +1244,12 @@ function serializeAttributeValue(value) {
1244
1244
  if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
1245
1245
  return { stringValue: String(value) };
1246
1246
  }
1247
- var import_promises30, import_node_path42, OtlpJsonFileExporter;
1247
+ var import_promises31, import_node_path43, OtlpJsonFileExporter;
1248
1248
  var init_otlp_json_file_exporter = __esm({
1249
1249
  "src/observability/otlp-json-file-exporter.ts"() {
1250
1250
  "use strict";
1251
- import_promises30 = require("fs/promises");
1252
- import_node_path42 = require("path");
1251
+ import_promises31 = require("fs/promises");
1252
+ import_node_path43 = require("path");
1253
1253
  OtlpJsonFileExporter = class {
1254
1254
  // biome-ignore lint/suspicious/noExplicitAny: serialized span data
1255
1255
  spans = [];
@@ -1288,7 +1288,7 @@ var init_otlp_json_file_exporter = __esm({
1288
1288
  }
1289
1289
  async flush() {
1290
1290
  if (this.spans.length === 0) return;
1291
- await (0, import_promises30.mkdir)((0, import_node_path42.dirname)(this.filePath), { recursive: true });
1291
+ await (0, import_promises31.mkdir)((0, import_node_path43.dirname)(this.filePath), { recursive: true });
1292
1292
  const otlpJson = {
1293
1293
  resourceSpans: [
1294
1294
  {
@@ -1302,8 +1302,8 @@ var init_otlp_json_file_exporter = __esm({
1302
1302
  }
1303
1303
  ]
1304
1304
  };
1305
- const { writeFile: writeFile8 } = await import("fs/promises");
1306
- await writeFile8(this.filePath, JSON.stringify(otlpJson, null, 2));
1305
+ const { writeFile: writeFile9 } = await import("fs/promises");
1306
+ await writeFile9(this.filePath, JSON.stringify(otlpJson, null, 2));
1307
1307
  }
1308
1308
  };
1309
1309
  }
@@ -1319,13 +1319,13 @@ function hrTimeDiffMs(start, end) {
1319
1319
  const diffNano = end[1] - start[1];
1320
1320
  return Math.round(diffSec * 1e3 + diffNano / 1e6);
1321
1321
  }
1322
- var import_node_fs12, import_promises31, import_node_path43, SimpleTraceFileExporter;
1322
+ var import_node_fs13, import_promises32, import_node_path44, SimpleTraceFileExporter;
1323
1323
  var init_simple_trace_file_exporter = __esm({
1324
1324
  "src/observability/simple-trace-file-exporter.ts"() {
1325
1325
  "use strict";
1326
- import_node_fs12 = require("fs");
1327
- import_promises31 = require("fs/promises");
1328
- import_node_path43 = require("path");
1326
+ import_node_fs13 = require("fs");
1327
+ import_promises32 = require("fs/promises");
1328
+ import_node_path44 = require("path");
1329
1329
  SimpleTraceFileExporter = class {
1330
1330
  stream = null;
1331
1331
  filePath;
@@ -1338,8 +1338,8 @@ var init_simple_trace_file_exporter = __esm({
1338
1338
  async ensureStream() {
1339
1339
  if (!this.streamReady) {
1340
1340
  this.streamReady = (async () => {
1341
- await (0, import_promises31.mkdir)((0, import_node_path43.dirname)(this.filePath), { recursive: true });
1342
- this.stream = (0, import_node_fs12.createWriteStream)(this.filePath, { flags: "w" });
1341
+ await (0, import_promises32.mkdir)((0, import_node_path44.dirname)(this.filePath), { recursive: true });
1342
+ this.stream = (0, import_node_fs13.createWriteStream)(this.filePath, { flags: "w" });
1343
1343
  return this.stream;
1344
1344
  })();
1345
1345
  }
@@ -1448,6 +1448,7 @@ __export(index_exports, {
1448
1448
  OtelTraceExporter: () => OtelTraceExporter,
1449
1449
  OtlpJsonFileExporter: () => OtlpJsonFileExporter,
1450
1450
  ProviderRegistry: () => ProviderRegistry,
1451
+ RepoManager: () => RepoManager,
1451
1452
  ResponseCache: () => ResponseCache,
1452
1453
  SimpleTraceFileExporter: () => SimpleTraceFileExporter,
1453
1454
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
@@ -1533,12 +1534,19 @@ __export(index_exports, {
1533
1534
  resolveTargetDefinition: () => resolveTargetDefinition,
1534
1535
  resolveWorkspaceTemplate: () => resolveWorkspaceTemplate,
1535
1536
  rubricEvaluationSchema: () => rubricEvaluationSchema,
1537
+ runContainsAllAssertion: () => runContainsAllAssertion,
1538
+ runContainsAnyAssertion: () => runContainsAnyAssertion,
1536
1539
  runContainsAssertion: () => runContainsAssertion,
1540
+ runEndsWithAssertion: () => runEndsWithAssertion,
1537
1541
  runEqualsAssertion: () => runEqualsAssertion,
1538
1542
  runEvalCase: () => runEvalCase,
1539
1543
  runEvaluation: () => runEvaluation,
1544
+ runIcontainsAllAssertion: () => runIcontainsAllAssertion,
1545
+ runIcontainsAnyAssertion: () => runIcontainsAnyAssertion,
1546
+ runIcontainsAssertion: () => runIcontainsAssertion,
1540
1547
  runIsJsonAssertion: () => runIsJsonAssertion,
1541
1548
  runRegexAssertion: () => runRegexAssertion,
1549
+ runStartsWithAssertion: () => runStartsWithAssertion,
1542
1550
  scoreToVerdict: () => scoreToVerdict,
1543
1551
  shouldEnableCache: () => shouldEnableCache,
1544
1552
  shouldSkipCacheForTemperature: () => shouldSkipCacheForTemperature,
@@ -1615,6 +1623,13 @@ var EVALUATOR_KIND_VALUES = [
1615
1623
  "execution_metrics",
1616
1624
  "agent_judge",
1617
1625
  "contains",
1626
+ "contains_any",
1627
+ "contains_all",
1628
+ "icontains",
1629
+ "icontains_any",
1630
+ "icontains_all",
1631
+ "starts_with",
1632
+ "ends_with",
1618
1633
  "regex",
1619
1634
  "is_json",
1620
1635
  "equals",
@@ -2888,18 +2903,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2888
2903
  });
2889
2904
  continue;
2890
2905
  }
2906
+ if (typeValue === "contains_any" || typeValue === "contains_all") {
2907
+ const value = asStringArrayStrict(rawEvaluator.value);
2908
+ if (!value || value.length === 0) {
2909
+ logWarning2(
2910
+ `Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
2911
+ );
2912
+ continue;
2913
+ }
2914
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2915
+ const required2 = parseRequired(rawEvaluator.required);
2916
+ evaluators.push({
2917
+ name,
2918
+ type: typeValue,
2919
+ value,
2920
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
2921
+ ...required2 !== void 0 ? { required: required2 } : {},
2922
+ ...negate !== void 0 ? { negate } : {}
2923
+ });
2924
+ continue;
2925
+ }
2926
+ if (typeValue === "icontains") {
2927
+ const value = asString(rawEvaluator.value);
2928
+ if (!value) {
2929
+ logWarning2(`Skipping icontains evaluator '${name}' in '${evalId}': missing value`);
2930
+ continue;
2931
+ }
2932
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2933
+ const required2 = parseRequired(rawEvaluator.required);
2934
+ evaluators.push({
2935
+ name,
2936
+ type: "icontains",
2937
+ value,
2938
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
2939
+ ...required2 !== void 0 ? { required: required2 } : {},
2940
+ ...negate !== void 0 ? { negate } : {}
2941
+ });
2942
+ continue;
2943
+ }
2944
+ if (typeValue === "icontains_any" || typeValue === "icontains_all") {
2945
+ const value = asStringArrayStrict(rawEvaluator.value);
2946
+ if (!value || value.length === 0) {
2947
+ logWarning2(
2948
+ `Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
2949
+ );
2950
+ continue;
2951
+ }
2952
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2953
+ const required2 = parseRequired(rawEvaluator.required);
2954
+ evaluators.push({
2955
+ name,
2956
+ type: typeValue,
2957
+ value,
2958
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
2959
+ ...required2 !== void 0 ? { required: required2 } : {},
2960
+ ...negate !== void 0 ? { negate } : {}
2961
+ });
2962
+ continue;
2963
+ }
2964
+ if (typeValue === "starts_with" || typeValue === "ends_with") {
2965
+ const value = asString(rawEvaluator.value);
2966
+ if (!value) {
2967
+ logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
2968
+ continue;
2969
+ }
2970
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2971
+ const required2 = parseRequired(rawEvaluator.required);
2972
+ evaluators.push({
2973
+ name,
2974
+ type: typeValue,
2975
+ value,
2976
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
2977
+ ...required2 !== void 0 ? { required: required2 } : {},
2978
+ ...negate !== void 0 ? { negate } : {}
2979
+ });
2980
+ continue;
2981
+ }
2891
2982
  if (typeValue === "regex") {
2892
2983
  const value = asString(rawEvaluator.value);
2893
2984
  if (!value) {
2894
2985
  logWarning2(`Skipping regex evaluator '${name}' in '${evalId}': missing value`);
2895
2986
  continue;
2896
2987
  }
2988
+ const flags = asString(rawEvaluator.flags);
2897
2989
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2898
2990
  const required2 = parseRequired(rawEvaluator.required);
2899
2991
  evaluators.push({
2900
2992
  name,
2901
2993
  type: "regex",
2902
2994
  value,
2995
+ ...flags !== void 0 ? { flags } : {},
2903
2996
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2904
2997
  ...required2 !== void 0 ? { required: required2 } : {},
2905
2998
  ...negate !== void 0 ? { negate } : {}
@@ -3072,15 +3165,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3072
3165
  }
3073
3166
  return evaluators.length > 0 ? evaluators : void 0;
3074
3167
  }
3075
- var ASSERTION_TYPES = /* @__PURE__ */ new Set(["contains", "regex", "is_json", "equals", "rubrics"]);
3168
+ var ASSERTION_TYPES = /* @__PURE__ */ new Set([
3169
+ "contains",
3170
+ "contains_any",
3171
+ "contains_all",
3172
+ "icontains",
3173
+ "icontains_any",
3174
+ "icontains_all",
3175
+ "starts_with",
3176
+ "ends_with",
3177
+ "regex",
3178
+ "is_json",
3179
+ "equals",
3180
+ "rubrics"
3181
+ ]);
3076
3182
  function generateAssertionName(typeValue, rawEvaluator) {
3077
3183
  if (!ASSERTION_TYPES.has(typeValue)) {
3078
3184
  return void 0;
3079
3185
  }
3080
3186
  const value = asString(rawEvaluator.value);
3187
+ const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
3081
3188
  switch (typeValue) {
3082
3189
  case "contains":
3083
3190
  return value ? `contains-${value}` : "contains";
3191
+ case "contains_any":
3192
+ return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
3193
+ case "contains_all":
3194
+ return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
3195
+ case "icontains":
3196
+ return value ? `icontains-${value}` : "icontains";
3197
+ case "icontains_any":
3198
+ return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
3199
+ case "icontains_all":
3200
+ return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
3201
+ case "starts_with":
3202
+ return value ? `starts_with-${value}` : "starts_with";
3203
+ case "ends_with":
3204
+ return value ? `ends_with-${value}` : "ends_with";
3084
3205
  case "regex":
3085
3206
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
3086
3207
  case "is_json":
@@ -3106,6 +3227,13 @@ function coerceEvaluator(candidate, contextId) {
3106
3227
  function asString(value) {
3107
3228
  return typeof value === "string" ? value : void 0;
3108
3229
  }
3230
+ function asStringArrayStrict(value) {
3231
+ if (!Array.isArray(value)) {
3232
+ return void 0;
3233
+ }
3234
+ const result = value.filter((v) => typeof v === "string");
3235
+ return result.length > 0 ? result : void 0;
3236
+ }
3109
3237
  function asStringArray(value, description) {
3110
3238
  if (value === void 0) {
3111
3239
  return void 0;
@@ -4423,6 +4551,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
4423
4551
  }
4424
4552
  return cwd ? { ...config, cwd } : config;
4425
4553
  }
4554
+ function parseRepoSource(raw) {
4555
+ if (!isJsonObject(raw)) return void 0;
4556
+ const obj = raw;
4557
+ if (obj.type === "git" && typeof obj.url === "string") {
4558
+ return { type: "git", url: obj.url };
4559
+ }
4560
+ if (obj.type === "local" && typeof obj.path === "string") {
4561
+ return { type: "local", path: obj.path };
4562
+ }
4563
+ return void 0;
4564
+ }
4565
+ function parseRepoCheckout(raw) {
4566
+ if (!isJsonObject(raw)) return void 0;
4567
+ const obj = raw;
4568
+ const ref = typeof obj.ref === "string" ? obj.ref : void 0;
4569
+ const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
4570
+ const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
4571
+ if (!ref && !resolve && ancestor === void 0) return void 0;
4572
+ return {
4573
+ ...ref !== void 0 && { ref },
4574
+ ...resolve !== void 0 && { resolve },
4575
+ ...ancestor !== void 0 && { ancestor }
4576
+ };
4577
+ }
4578
+ function parseRepoClone(raw) {
4579
+ if (!isJsonObject(raw)) return void 0;
4580
+ const obj = raw;
4581
+ const depth = typeof obj.depth === "number" ? obj.depth : void 0;
4582
+ const filter = typeof obj.filter === "string" ? obj.filter : void 0;
4583
+ const sparse = Array.isArray(obj.sparse) ? obj.sparse.filter((s) => typeof s === "string") : void 0;
4584
+ if (depth === void 0 && !filter && !sparse) return void 0;
4585
+ return {
4586
+ ...depth !== void 0 && { depth },
4587
+ ...filter !== void 0 && { filter },
4588
+ ...sparse !== void 0 && { sparse }
4589
+ };
4590
+ }
4591
+ function parseRepoConfig(raw) {
4592
+ if (!isJsonObject(raw)) return void 0;
4593
+ const obj = raw;
4594
+ const repoPath = typeof obj.path === "string" ? obj.path : void 0;
4595
+ const source = parseRepoSource(obj.source);
4596
+ if (!repoPath || !source) return void 0;
4597
+ const checkout = parseRepoCheckout(obj.checkout);
4598
+ const clone = parseRepoClone(obj.clone);
4599
+ return {
4600
+ path: repoPath,
4601
+ source,
4602
+ ...checkout !== void 0 && { checkout },
4603
+ ...clone !== void 0 && { clone }
4604
+ };
4605
+ }
4606
+ function parseResetConfig(raw) {
4607
+ if (!isJsonObject(raw)) return void 0;
4608
+ const obj = raw;
4609
+ const strategy = obj.strategy === "none" || obj.strategy === "hard" || obj.strategy === "recreate" ? obj.strategy : void 0;
4610
+ const afterEach = typeof obj.after_each === "boolean" ? obj.after_each : void 0;
4611
+ if (!strategy && afterEach === void 0) return void 0;
4612
+ return {
4613
+ ...strategy !== void 0 && { strategy },
4614
+ ...afterEach !== void 0 && { after_each: afterEach }
4615
+ };
4616
+ }
4426
4617
  function parseWorkspaceConfig(raw, evalFileDir) {
4427
4618
  if (!isJsonObject(raw)) return void 0;
4428
4619
  const obj = raw;
@@ -4430,13 +4621,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
4430
4621
  if (template && !import_node_path8.default.isAbsolute(template)) {
4431
4622
  template = import_node_path8.default.resolve(evalFileDir, template);
4432
4623
  }
4624
+ const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
4625
+ const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
4626
+ const reset = parseResetConfig(obj.reset);
4433
4627
  const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
4434
4628
  const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
4435
4629
  const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
4436
4630
  const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
4437
- if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach) return void 0;
4631
+ if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
4632
+ return void 0;
4438
4633
  return {
4439
4634
  ...template !== void 0 && { template },
4635
+ ...isolation !== void 0 && { isolation },
4636
+ ...repos !== void 0 && { repos },
4637
+ ...reset !== void 0 && { reset },
4440
4638
  ...beforeAll !== void 0 && { before_all: beforeAll },
4441
4639
  ...afterAll !== void 0 && { after_all: afterAll },
4442
4640
  ...beforeEach !== void 0 && { before_each: beforeEach },
@@ -4449,6 +4647,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
4449
4647
  if (!caseLevel) return suiteLevel;
4450
4648
  return {
4451
4649
  template: caseLevel.template ?? suiteLevel.template,
4650
+ isolation: caseLevel.isolation ?? suiteLevel.isolation,
4651
+ repos: caseLevel.repos ?? suiteLevel.repos,
4652
+ reset: caseLevel.reset ?? suiteLevel.reset,
4452
4653
  before_all: caseLevel.before_all ?? suiteLevel.before_all,
4453
4654
  after_all: caseLevel.after_all ?? suiteLevel.after_all,
4454
4655
  before_each: caseLevel.before_each ?? suiteLevel.before_each,
@@ -5103,11 +5304,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
5103
5304
  }
5104
5305
  return claudeSdkModule;
5105
5306
  }
5106
- var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
5107
- - Do NOT create any additional output files in the workspace.
5108
- - All intended file outputs/changes MUST be written in your response.
5109
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
5110
- This is required for evaluation scoring.`;
5111
5307
  var ClaudeProvider = class {
5112
5308
  id;
5113
5309
  kind = "claude";
@@ -5129,7 +5325,7 @@ var ClaudeProvider = class {
5129
5325
  const logger = await this.createStreamLogger(request).catch(() => void 0);
5130
5326
  const inputFiles = normalizeInputFiles(request.inputFiles);
5131
5327
  const prompt = buildPromptDocument(request, inputFiles);
5132
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT2);
5328
+ const systemPrompt = this.config.systemPrompt;
5133
5329
  const queryOptions = {
5134
5330
  permissionMode: "bypassPermissions",
5135
5331
  allowDangerouslySkipPermissions: true,
@@ -6110,11 +6306,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
6110
6306
  }
6111
6307
  return codexSdkModule;
6112
6308
  }
6113
- var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
6114
- - Do NOT create any additional output files in the workspace.
6115
- - All intended file outputs/changes MUST be written in your response.
6116
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
6117
- This is required for evaluation scoring.`;
6118
6309
  var CodexProvider = class {
6119
6310
  id;
6120
6311
  kind = "codex";
@@ -6149,7 +6340,7 @@ var CodexProvider = class {
6149
6340
  const thread = codex.startThread(threadOptions);
6150
6341
  const inputFiles = normalizeInputFiles(request.inputFiles);
6151
6342
  const basePrompt = buildPromptDocument(request, inputFiles);
6152
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT3);
6343
+ const systemPrompt = this.config.systemPrompt;
6153
6344
  const prompt = systemPrompt ? `${systemPrompt}
6154
6345
 
6155
6346
  ${basePrompt}` : basePrompt;
@@ -6516,7 +6707,7 @@ var import_node_path14 = __toESM(require("path"), 1);
6516
6707
  var import_node_url2 = require("url");
6517
6708
  var import_meta = {};
6518
6709
  function resolvePlatformCliPath() {
6519
- const os4 = (0, import_node_os2.platform)();
6710
+ const os5 = (0, import_node_os2.platform)();
6520
6711
  const cpu = (0, import_node_os2.arch)();
6521
6712
  const platformMap = {
6522
6713
  linux: "linux",
@@ -6527,13 +6718,13 @@ function resolvePlatformCliPath() {
6527
6718
  x64: "x64",
6528
6719
  arm64: "arm64"
6529
6720
  };
6530
- const osPart = platformMap[os4];
6721
+ const osPart = platformMap[os5];
6531
6722
  const archPart = archMap[cpu];
6532
6723
  if (!osPart || !archPart) {
6533
6724
  return void 0;
6534
6725
  }
6535
6726
  const packageName = `@github/copilot-${osPart}-${archPart}`;
6536
- const binaryName = os4 === "win32" ? "copilot.exe" : "copilot";
6727
+ const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
6537
6728
  try {
6538
6729
  const resolved = import_meta.resolve(`${packageName}/package.json`);
6539
6730
  const packageJsonPath = resolved.startsWith("file:") ? (0, import_node_url2.fileURLToPath)(resolved) : resolved;
@@ -6675,11 +6866,6 @@ function isLogStreamingDisabled(envKey) {
6675
6866
  }
6676
6867
 
6677
6868
  // src/evaluation/providers/copilot-cli.ts
6678
- var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
6679
- - Do NOT create any additional output files in the workspace.
6680
- - All intended file outputs/changes MUST be written in your response.
6681
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
6682
- This is required for evaluation scoring.`;
6683
6869
  var CopilotCliProvider = class {
6684
6870
  id;
6685
6871
  kind = "copilot-cli";
@@ -6882,8 +7068,8 @@ var CopilotCliProvider = class {
6882
7068
  }
6883
7069
  return args;
6884
7070
  }
6885
- resolveSystemPrompt(request) {
6886
- return this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT4);
7071
+ resolveSystemPrompt(_request) {
7072
+ return this.config.systemPrompt;
6887
7073
  }
6888
7074
  async raceWithTimeout(sendPromise, agentProcess) {
6889
7075
  const timeoutMs = this.config.timeoutMs;
@@ -7071,21 +7257,16 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
7071
7257
  }
7072
7258
  return copilotSdkModule;
7073
7259
  }
7074
- var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
7075
- - Do NOT create any additional output files in the workspace.
7076
- - All intended file outputs/changes MUST be written in your response.
7077
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
7078
- This is required for evaluation scoring.`;
7079
7260
  var CopilotSdkProvider = class {
7080
7261
  id;
7081
- kind = "copilot";
7262
+ kind = "copilot-sdk";
7082
7263
  targetName;
7083
7264
  supportsBatch = false;
7084
7265
  config;
7085
7266
  // biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
7086
7267
  client = null;
7087
7268
  constructor(targetName, config) {
7088
- this.id = `copilot:${targetName}`;
7269
+ this.id = `copilot-sdk:${targetName}`;
7089
7270
  this.targetName = targetName;
7090
7271
  this.config = config;
7091
7272
  }
@@ -7108,7 +7289,7 @@ var CopilotSdkProvider = class {
7108
7289
  if (cwd) {
7109
7290
  sessionOptions.workingDirectory = cwd;
7110
7291
  }
7111
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT5);
7292
+ const systemPrompt = this.config.systemPrompt;
7112
7293
  if (systemPrompt) {
7113
7294
  sessionOptions.systemMessage = {
7114
7295
  mode: "append",
@@ -7624,11 +7805,6 @@ function subscribeToPiLogEntries(listener) {
7624
7805
  // src/evaluation/providers/pi-coding-agent.ts
7625
7806
  var WORKSPACE_PREFIX = "agentv-pi-";
7626
7807
  var PROMPT_FILENAME = "prompt.md";
7627
- var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
7628
- - Do NOT create any additional output files in the workspace.
7629
- - All intended file outputs/changes MUST be written in your response.
7630
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
7631
- This is required for evaluation scoring.`;
7632
7808
  var PiCodingAgentProvider = class {
7633
7809
  id;
7634
7810
  kind = "pi-coding-agent";
@@ -7705,7 +7881,7 @@ var PiCodingAgentProvider = class {
7705
7881
  }
7706
7882
  return import_node_path17.default.resolve(this.config.cwd);
7707
7883
  }
7708
- buildPiArgs(prompt, inputFiles, captureFileChanges2) {
7884
+ buildPiArgs(prompt, inputFiles, _captureFileChanges) {
7709
7885
  const args = [];
7710
7886
  if (this.config.provider) {
7711
7887
  args.push("--provider", this.config.provider);
@@ -7733,7 +7909,7 @@ var PiCodingAgentProvider = class {
7733
7909
  args.push(`@${file}`);
7734
7910
  }
7735
7911
  }
7736
- const systemPrompt = this.config.systemPrompt ?? (captureFileChanges2 ? void 0 : DEFAULT_SYSTEM_PROMPT6);
7912
+ const systemPrompt = this.config.systemPrompt;
7737
7913
  const fullPrompt = systemPrompt ? `${systemPrompt}
7738
7914
 
7739
7915
  ${prompt}` : prompt;
@@ -8604,17 +8780,17 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
8604
8780
  providerBatching,
8605
8781
  config: resolveCodexConfig(parsed, env, evalFilePath)
8606
8782
  };
8607
- case "copilot":
8608
8783
  case "copilot-sdk":
8609
8784
  case "copilot_sdk":
8610
8785
  return {
8611
- kind: "copilot",
8786
+ kind: "copilot-sdk",
8612
8787
  name: parsed.name,
8613
8788
  judgeTarget: parsed.judge_target,
8614
8789
  workers: parsed.workers,
8615
8790
  providerBatching,
8616
8791
  config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
8617
8792
  };
8793
+ case "copilot":
8618
8794
  case "copilot-cli":
8619
8795
  return {
8620
8796
  kind: "copilot-cli",
@@ -9225,8 +9401,8 @@ function resolveCliConfig(target, env, evalFilePath) {
9225
9401
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
9226
9402
  if (!parseResult.success) {
9227
9403
  const firstError = parseResult.error.errors[0];
9228
- const path41 = firstError?.path.join(".") || "";
9229
- const prefix = path41 ? `${target.name} ${path41}: ` : `${target.name}: `;
9404
+ const path42 = firstError?.path.join(".") || "";
9405
+ const prefix = path42 ? `${target.name} ${path42}: ` : `${target.name}: `;
9230
9406
  throw new Error(`${prefix}${firstError?.message}`);
9231
9407
  }
9232
9408
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -10523,7 +10699,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
10523
10699
 
10524
10700
  **IMPORTANT**: Follow these exact steps:
10525
10701
  1. Create and write your complete response to: {{responseFileTmp}}
10526
- - Do NOT create any additional output files in the workspace.
10527
10702
  - All intended file outputs/changes MUST be written in your response file.
10528
10703
  - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
10529
10704
  2. When completely finished, run these PowerShell commands to signal completion:
@@ -10542,7 +10717,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
10542
10717
 
10543
10718
  **IMPORTANT**: Follow these exact steps:
10544
10719
  1. Create and write your complete response to: {{responseFileTmp}}
10545
- - Do NOT create any additional output files in the workspace.
10546
10720
  - All intended file outputs/changes MUST be written in your response file.
10547
10721
  - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
10548
10722
  2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
@@ -10968,7 +11142,7 @@ async function discoverProviders(registry, baseDir) {
10968
11142
  // src/evaluation/providers/index.ts
10969
11143
  function createBuiltinProviderRegistry() {
10970
11144
  const registry = new ProviderRegistry();
10971
- registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
11145
+ registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
10972
11146
  "vscode-insiders",
10973
11147
  (t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
10974
11148
  );
@@ -11157,16 +11331,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
11157
11331
  });
11158
11332
  }
11159
11333
  async function execShellWithStdin(command, stdinPayload, options = {}) {
11160
- const { mkdir: mkdir15, readFile: readFile13, rm: rm5, writeFile: writeFile8 } = await import("fs/promises");
11334
+ const { mkdir: mkdir16, readFile: readFile13, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
11161
11335
  const { tmpdir: tmpdir3 } = await import("os");
11162
- const path41 = await import("path");
11336
+ const path42 = await import("path");
11163
11337
  const { randomUUID: randomUUID8 } = await import("crypto");
11164
- const dir = path41.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
11165
- await mkdir15(dir, { recursive: true });
11166
- const stdinPath = path41.join(dir, "stdin.txt");
11167
- const stdoutPath = path41.join(dir, "stdout.txt");
11168
- const stderrPath = path41.join(dir, "stderr.txt");
11169
- await writeFile8(stdinPath, stdinPayload, "utf8");
11338
+ const dir = path42.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
11339
+ await mkdir16(dir, { recursive: true });
11340
+ const stdinPath = path42.join(dir, "stdin.txt");
11341
+ const stdoutPath = path42.join(dir, "stdout.txt");
11342
+ const stderrPath = path42.join(dir, "stderr.txt");
11343
+ await writeFile9(stdinPath, stdinPayload, "utf8");
11170
11344
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
11171
11345
  const { spawn: spawn4 } = await import("child_process");
11172
11346
  try {
@@ -11199,7 +11373,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
11199
11373
  const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
11200
11374
  return { stdout, stderr, exitCode };
11201
11375
  } finally {
11202
- await rm5(dir, { recursive: true, force: true });
11376
+ await rm6(dir, { recursive: true, force: true });
11203
11377
  }
11204
11378
  }
11205
11379
 
@@ -11517,7 +11691,7 @@ var CodeEvaluator = class {
11517
11691
  outputPath,
11518
11692
  guidelineFiles: context2.evalCase.guideline_paths,
11519
11693
  inputFiles: context2.evalCase.file_paths.filter(
11520
- (path41) => !context2.evalCase.guideline_paths.includes(path41)
11694
+ (path42) => !context2.evalCase.guideline_paths.includes(path42)
11521
11695
  ),
11522
11696
  input: context2.evalCase.input,
11523
11697
  trace: context2.trace ?? null,
@@ -11648,7 +11822,7 @@ var import_ai3 = require("ai");
11648
11822
  // src/evaluation/providers/types.ts
11649
11823
  var AGENT_PROVIDER_KINDS = [
11650
11824
  "codex",
11651
- "copilot",
11825
+ "copilot-sdk",
11652
11826
  "copilot-cli",
11653
11827
  "pi-coding-agent",
11654
11828
  "claude",
@@ -11794,13 +11968,15 @@ ${context2.fileChanges}`;
11794
11968
  evaluatorRawRequest,
11795
11969
  tokenUsage
11796
11970
  };
11797
- } catch {
11971
+ } catch (e) {
11972
+ const message = e instanceof Error ? e.message : String(e);
11798
11973
  return {
11799
11974
  score: 0,
11800
- verdict: "fail",
11975
+ verdict: "skip",
11801
11976
  hits: [],
11802
- misses: [],
11977
+ misses: [`Judge parse failure after 3 attempts: ${message}`],
11803
11978
  expectedAspectCount: 1,
11979
+ reasoning: `Judge parse failure after 3 attempts: ${message}`,
11804
11980
  evaluatorRawRequest
11805
11981
  };
11806
11982
  }
@@ -12742,115 +12918,115 @@ var FieldAccuracyEvaluator = class {
12742
12918
  * Evaluate a single field against the expected value.
12743
12919
  */
12744
12920
  evaluateField(fieldConfig, candidateData, expectedData) {
12745
- const { path: path41, match, required = true, weight = 1 } = fieldConfig;
12746
- const candidateValue = resolvePath(candidateData, path41);
12747
- const expectedValue = resolvePath(expectedData, path41);
12921
+ const { path: path42, match, required = true, weight = 1 } = fieldConfig;
12922
+ const candidateValue = resolvePath(candidateData, path42);
12923
+ const expectedValue = resolvePath(expectedData, path42);
12748
12924
  if (expectedValue === void 0) {
12749
12925
  return {
12750
- path: path41,
12926
+ path: path42,
12751
12927
  score: 1,
12752
12928
  // No expected value means no comparison needed
12753
12929
  weight,
12754
12930
  hit: true,
12755
- message: `${path41}: no expected value`
12931
+ message: `${path42}: no expected value`
12756
12932
  };
12757
12933
  }
12758
12934
  if (candidateValue === void 0) {
12759
12935
  if (required) {
12760
12936
  return {
12761
- path: path41,
12937
+ path: path42,
12762
12938
  score: 0,
12763
12939
  weight,
12764
12940
  hit: false,
12765
- message: `${path41} (required, missing)`
12941
+ message: `${path42} (required, missing)`
12766
12942
  };
12767
12943
  }
12768
12944
  return {
12769
- path: path41,
12945
+ path: path42,
12770
12946
  score: 1,
12771
12947
  // Don't penalize missing optional fields
12772
12948
  weight: 0,
12773
12949
  // Zero weight means it won't affect the score
12774
12950
  hit: true,
12775
- message: `${path41}: optional field missing`
12951
+ message: `${path42}: optional field missing`
12776
12952
  };
12777
12953
  }
12778
12954
  switch (match) {
12779
12955
  case "exact":
12780
- return this.compareExact(path41, candidateValue, expectedValue, weight);
12956
+ return this.compareExact(path42, candidateValue, expectedValue, weight);
12781
12957
  case "numeric_tolerance":
12782
12958
  return this.compareNumericTolerance(
12783
- path41,
12959
+ path42,
12784
12960
  candidateValue,
12785
12961
  expectedValue,
12786
12962
  fieldConfig,
12787
12963
  weight
12788
12964
  );
12789
12965
  case "date":
12790
- return this.compareDate(path41, candidateValue, expectedValue, fieldConfig, weight);
12966
+ return this.compareDate(path42, candidateValue, expectedValue, fieldConfig, weight);
12791
12967
  default:
12792
12968
  return {
12793
- path: path41,
12969
+ path: path42,
12794
12970
  score: 0,
12795
12971
  weight,
12796
12972
  hit: false,
12797
- message: `${path41}: unknown match type "${match}"`
12973
+ message: `${path42}: unknown match type "${match}"`
12798
12974
  };
12799
12975
  }
12800
12976
  }
12801
12977
  /**
12802
12978
  * Exact equality comparison.
12803
12979
  */
12804
- compareExact(path41, candidateValue, expectedValue, weight) {
12980
+ compareExact(path42, candidateValue, expectedValue, weight) {
12805
12981
  if (deepEqual(candidateValue, expectedValue)) {
12806
12982
  return {
12807
- path: path41,
12983
+ path: path42,
12808
12984
  score: 1,
12809
12985
  weight,
12810
12986
  hit: true,
12811
- message: path41
12987
+ message: path42
12812
12988
  };
12813
12989
  }
12814
12990
  if (typeof candidateValue !== typeof expectedValue) {
12815
12991
  return {
12816
- path: path41,
12992
+ path: path42,
12817
12993
  score: 0,
12818
12994
  weight,
12819
12995
  hit: false,
12820
- message: `${path41} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12996
+ message: `${path42} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12821
12997
  };
12822
12998
  }
12823
12999
  return {
12824
- path: path41,
13000
+ path: path42,
12825
13001
  score: 0,
12826
13002
  weight,
12827
13003
  hit: false,
12828
- message: `${path41} (value mismatch)`
13004
+ message: `${path42} (value mismatch)`
12829
13005
  };
12830
13006
  }
12831
13007
  /**
12832
13008
  * Numeric comparison with absolute or relative tolerance.
12833
13009
  */
12834
- compareNumericTolerance(path41, candidateValue, expectedValue, fieldConfig, weight) {
13010
+ compareNumericTolerance(path42, candidateValue, expectedValue, fieldConfig, weight) {
12835
13011
  const { tolerance = 0, relative = false } = fieldConfig;
12836
13012
  const candidateNum = toNumber2(candidateValue);
12837
13013
  const expectedNum = toNumber2(expectedValue);
12838
13014
  if (candidateNum === null || expectedNum === null) {
12839
13015
  return {
12840
- path: path41,
13016
+ path: path42,
12841
13017
  score: 0,
12842
13018
  weight,
12843
13019
  hit: false,
12844
- message: `${path41} (non-numeric value)`
13020
+ message: `${path42} (non-numeric value)`
12845
13021
  };
12846
13022
  }
12847
13023
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
12848
13024
  return {
12849
- path: path41,
13025
+ path: path42,
12850
13026
  score: 0,
12851
13027
  weight,
12852
13028
  hit: false,
12853
- message: `${path41} (invalid numeric value)`
13029
+ message: `${path42} (invalid numeric value)`
12854
13030
  };
12855
13031
  }
12856
13032
  const diff = Math.abs(candidateNum - expectedNum);
@@ -12863,61 +13039,61 @@ var FieldAccuracyEvaluator = class {
12863
13039
  }
12864
13040
  if (withinTolerance) {
12865
13041
  return {
12866
- path: path41,
13042
+ path: path42,
12867
13043
  score: 1,
12868
13044
  weight,
12869
13045
  hit: true,
12870
- message: `${path41} (within tolerance: diff=${diff.toFixed(2)})`
13046
+ message: `${path42} (within tolerance: diff=${diff.toFixed(2)})`
12871
13047
  };
12872
13048
  }
12873
13049
  return {
12874
- path: path41,
13050
+ path: path42,
12875
13051
  score: 0,
12876
13052
  weight,
12877
13053
  hit: false,
12878
- message: `${path41} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13054
+ message: `${path42} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12879
13055
  };
12880
13056
  }
12881
13057
  /**
12882
13058
  * Date comparison with format normalization.
12883
13059
  */
12884
- compareDate(path41, candidateValue, expectedValue, fieldConfig, weight) {
13060
+ compareDate(path42, candidateValue, expectedValue, fieldConfig, weight) {
12885
13061
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
12886
13062
  const candidateDate = parseDate(String(candidateValue), formats);
12887
13063
  const expectedDate = parseDate(String(expectedValue), formats);
12888
13064
  if (candidateDate === null) {
12889
13065
  return {
12890
- path: path41,
13066
+ path: path42,
12891
13067
  score: 0,
12892
13068
  weight,
12893
13069
  hit: false,
12894
- message: `${path41} (unparseable candidate date)`
13070
+ message: `${path42} (unparseable candidate date)`
12895
13071
  };
12896
13072
  }
12897
13073
  if (expectedDate === null) {
12898
13074
  return {
12899
- path: path41,
13075
+ path: path42,
12900
13076
  score: 0,
12901
13077
  weight,
12902
13078
  hit: false,
12903
- message: `${path41} (unparseable expected date)`
13079
+ message: `${path42} (unparseable expected date)`
12904
13080
  };
12905
13081
  }
12906
13082
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
12907
13083
  return {
12908
- path: path41,
13084
+ path: path42,
12909
13085
  score: 1,
12910
13086
  weight,
12911
13087
  hit: true,
12912
- message: path41
13088
+ message: path42
12913
13089
  };
12914
13090
  }
12915
13091
  return {
12916
- path: path41,
13092
+ path: path42,
12917
13093
  score: 0,
12918
13094
  weight,
12919
13095
  hit: false,
12920
- message: `${path41} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13096
+ message: `${path42} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12921
13097
  };
12922
13098
  }
12923
13099
  /**
@@ -12958,11 +13134,11 @@ var FieldAccuracyEvaluator = class {
12958
13134
  };
12959
13135
  }
12960
13136
  };
12961
- function resolvePath(obj, path41) {
12962
- if (!path41 || !obj) {
13137
+ function resolvePath(obj, path42) {
13138
+ if (!path42 || !obj) {
12963
13139
  return void 0;
12964
13140
  }
12965
- const parts = path41.split(/\.|\[|\]/).filter((p) => p.length > 0);
13141
+ const parts = path42.split(/\.|\[|\]/).filter((p) => p.length > 0);
12966
13142
  let current = obj;
12967
13143
  for (const part of parts) {
12968
13144
  if (current === null || current === void 0) {
@@ -13780,8 +13956,8 @@ var TokenUsageEvaluator = class {
13780
13956
  };
13781
13957
 
13782
13958
  // src/evaluation/evaluators/tool-trajectory.ts
13783
- function getNestedValue(obj, path41) {
13784
- const parts = path41.split(".");
13959
+ function getNestedValue(obj, path42) {
13960
+ const parts = path42.split(".");
13785
13961
  let current = obj;
13786
13962
  for (const part of parts) {
13787
13963
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -14245,13 +14421,78 @@ function runContainsAssertion(output, value) {
14245
14421
  misses: passed ? [] : [`Output does not contain "${value}"`]
14246
14422
  };
14247
14423
  }
14248
- function runRegexAssertion(output, pattern) {
14249
- const regex = new RegExp(pattern);
14424
+ function runContainsAnyAssertion(output, values) {
14425
+ const matched = values.filter((v) => output.includes(v));
14426
+ const passed = matched.length > 0;
14427
+ return {
14428
+ score: passed ? 1 : 0,
14429
+ hits: passed ? [`Output contains "${matched[0]}"`] : [],
14430
+ misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
14431
+ };
14432
+ }
14433
+ function runContainsAllAssertion(output, values) {
14434
+ const missing = values.filter((v) => !output.includes(v));
14435
+ const passed = missing.length === 0;
14436
+ return {
14437
+ score: passed ? 1 : 0,
14438
+ hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
14439
+ misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
14440
+ };
14441
+ }
14442
+ function runIcontainsAssertion(output, value) {
14443
+ const passed = output.toLowerCase().includes(value.toLowerCase());
14444
+ return {
14445
+ score: passed ? 1 : 0,
14446
+ hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
14447
+ misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
14448
+ };
14449
+ }
14450
+ function runIcontainsAnyAssertion(output, values) {
14451
+ const lower = output.toLowerCase();
14452
+ const matched = values.filter((v) => lower.includes(v.toLowerCase()));
14453
+ const passed = matched.length > 0;
14454
+ return {
14455
+ score: passed ? 1 : 0,
14456
+ hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
14457
+ misses: passed ? [] : [
14458
+ `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
14459
+ ]
14460
+ };
14461
+ }
14462
+ function runIcontainsAllAssertion(output, values) {
14463
+ const lower = output.toLowerCase();
14464
+ const missing = values.filter((v) => !lower.includes(v.toLowerCase()));
14465
+ const passed = missing.length === 0;
14466
+ return {
14467
+ score: passed ? 1 : 0,
14468
+ hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
14469
+ misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
14470
+ };
14471
+ }
14472
+ function runStartsWithAssertion(output, value) {
14473
+ const passed = output.trim().startsWith(value.trim());
14474
+ return {
14475
+ score: passed ? 1 : 0,
14476
+ hits: passed ? [`Output starts with "${value}"`] : [],
14477
+ misses: passed ? [] : [`Output does not start with "${value}"`]
14478
+ };
14479
+ }
14480
+ function runEndsWithAssertion(output, value) {
14481
+ const passed = output.trim().endsWith(value.trim());
14482
+ return {
14483
+ score: passed ? 1 : 0,
14484
+ hits: passed ? [`Output ends with "${value}"`] : [],
14485
+ misses: passed ? [] : [`Output does not end with "${value}"`]
14486
+ };
14487
+ }
14488
+ function runRegexAssertion(output, pattern, flags) {
14489
+ const regex = new RegExp(pattern, flags);
14250
14490
  const passed = regex.test(output);
14491
+ const flagsLabel = flags ? ` (flags: ${flags})` : "";
14251
14492
  return {
14252
14493
  score: passed ? 1 : 0,
14253
- hits: passed ? [`Output matches pattern /${pattern}/`] : [],
14254
- misses: passed ? [] : [`Output does not match pattern /${pattern}/`]
14494
+ hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
14495
+ misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
14255
14496
  };
14256
14497
  }
14257
14498
  function runIsJsonAssertion(output) {
@@ -14277,9 +14518,9 @@ function runEqualsAssertion(output, value) {
14277
14518
  }
14278
14519
 
14279
14520
  // src/evaluation/orchestrator.ts
14280
- var import_node_crypto8 = require("crypto");
14281
- var import_promises28 = require("fs/promises");
14282
- var import_node_path39 = __toESM(require("path"), 1);
14521
+ var import_node_crypto9 = require("crypto");
14522
+ var import_promises29 = require("fs/promises");
14523
+ var import_node_path40 = __toESM(require("path"), 1);
14283
14524
  var import_micromatch4 = __toESM(require("micromatch"), 1);
14284
14525
 
14285
14526
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -14669,13 +14910,13 @@ var containsFactory = (config) => {
14669
14910
  var regexFactory = (config) => {
14670
14911
  const c = config;
14671
14912
  return new DeterministicAssertionEvaluator("regex", (ctx) => {
14672
- const result = runRegexAssertion(ctx.candidate, c.value);
14913
+ const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
14673
14914
  return {
14674
14915
  score: result.score,
14675
14916
  verdict: result.score === 1 ? "pass" : "fail",
14676
14917
  hits: result.hits,
14677
14918
  misses: result.misses,
14678
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/` : `Output does not match pattern /${c.value}/`,
14919
+ reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
14679
14920
  expectedAspectCount: 1
14680
14921
  };
14681
14922
  });
@@ -14707,9 +14948,107 @@ var equalsFactory = (config) => {
14707
14948
  };
14708
14949
  });
14709
14950
  };
14951
+ var containsAnyFactory = (config) => {
14952
+ const c = config;
14953
+ return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
14954
+ const result = runContainsAnyAssertion(ctx.candidate, c.value);
14955
+ return {
14956
+ score: result.score,
14957
+ verdict: result.score === 1 ? "pass" : "fail",
14958
+ hits: result.hits,
14959
+ misses: result.misses,
14960
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
14961
+ expectedAspectCount: 1
14962
+ };
14963
+ });
14964
+ };
14965
+ var containsAllFactory = (config) => {
14966
+ const c = config;
14967
+ return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
14968
+ const result = runContainsAllAssertion(ctx.candidate, c.value);
14969
+ return {
14970
+ score: result.score,
14971
+ verdict: result.score === 1 ? "pass" : "fail",
14972
+ hits: result.hits,
14973
+ misses: result.misses,
14974
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
14975
+ expectedAspectCount: 1
14976
+ };
14977
+ });
14978
+ };
14979
+ var icontainsFactory = (config) => {
14980
+ const c = config;
14981
+ return new DeterministicAssertionEvaluator("icontains", (ctx) => {
14982
+ const result = runIcontainsAssertion(ctx.candidate, c.value);
14983
+ return {
14984
+ score: result.score,
14985
+ verdict: result.score === 1 ? "pass" : "fail",
14986
+ hits: result.hits,
14987
+ misses: result.misses,
14988
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
14989
+ expectedAspectCount: 1
14990
+ };
14991
+ });
14992
+ };
14993
+ var icontainsAnyFactory = (config) => {
14994
+ const c = config;
14995
+ return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
14996
+ const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
14997
+ return {
14998
+ score: result.score,
14999
+ verdict: result.score === 1 ? "pass" : "fail",
15000
+ hits: result.hits,
15001
+ misses: result.misses,
15002
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15003
+ expectedAspectCount: 1
15004
+ };
15005
+ });
15006
+ };
15007
+ var icontainsAllFactory = (config) => {
15008
+ const c = config;
15009
+ return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
15010
+ const result = runIcontainsAllAssertion(ctx.candidate, c.value);
15011
+ return {
15012
+ score: result.score,
15013
+ verdict: result.score === 1 ? "pass" : "fail",
15014
+ hits: result.hits,
15015
+ misses: result.misses,
15016
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15017
+ expectedAspectCount: 1
15018
+ };
15019
+ });
15020
+ };
15021
+ var startsWithFactory = (config) => {
15022
+ const c = config;
15023
+ return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
15024
+ const result = runStartsWithAssertion(ctx.candidate, c.value);
15025
+ return {
15026
+ score: result.score,
15027
+ verdict: result.score === 1 ? "pass" : "fail",
15028
+ hits: result.hits,
15029
+ misses: result.misses,
15030
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15031
+ expectedAspectCount: 1
15032
+ };
15033
+ });
15034
+ };
15035
+ var endsWithFactory = (config) => {
15036
+ const c = config;
15037
+ return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
15038
+ const result = runEndsWithAssertion(ctx.candidate, c.value);
15039
+ return {
15040
+ score: result.score,
15041
+ verdict: result.score === 1 ? "pass" : "fail",
15042
+ hits: result.hits,
15043
+ misses: result.misses,
15044
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15045
+ expectedAspectCount: 1
15046
+ };
15047
+ });
15048
+ };
14710
15049
  function createBuiltinRegistry() {
14711
15050
  const registry = new EvaluatorRegistry();
14712
- registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
15051
+ registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
14713
15052
  return registry;
14714
15053
  }
14715
15054
 
@@ -15053,37 +15392,217 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
15053
15392
  }
15054
15393
  }
15055
15394
 
15056
- // src/evaluation/workspace/resolve.ts
15395
+ // src/evaluation/workspace/repo-manager.ts
15396
+ var import_node_child_process7 = require("child_process");
15397
+ var import_node_crypto8 = require("crypto");
15398
+ var import_node_fs11 = require("fs");
15057
15399
  var import_promises27 = require("fs/promises");
15400
+ var import_node_os7 = __toESM(require("os"), 1);
15058
15401
  var import_node_path38 = __toESM(require("path"), 1);
15402
+ var import_node_util5 = require("util");
15403
+ var execFileAsync = (0, import_node_util5.promisify)(import_node_child_process7.execFile);
15404
+ var DEFAULT_CACHE_DIR = import_node_path38.default.join(import_node_os7.default.homedir(), ".agentv", "git-cache");
15405
+ var DEFAULT_TIMEOUT_MS2 = 3e5;
15406
+ var LOCK_TIMEOUT_MS = 6e4;
15407
+ function gitEnv() {
15408
+ const env = { ...process.env };
15409
+ for (const key of Object.keys(env)) {
15410
+ if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
15411
+ delete env[key];
15412
+ }
15413
+ }
15414
+ return {
15415
+ ...env,
15416
+ GIT_TERMINAL_PROMPT: "0",
15417
+ GIT_ASKPASS: "",
15418
+ GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
15419
+ };
15420
+ }
15421
+ function cacheKey(source) {
15422
+ const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
15423
+ return (0, import_node_crypto8.createHash)("sha256").update(raw).digest("hex");
15424
+ }
15425
+ function getSourceUrl(source) {
15426
+ return source.type === "git" ? source.url : source.path;
15427
+ }
15428
+ async function git(args, opts) {
15429
+ const { stdout } = await execFileAsync("git", args, {
15430
+ cwd: opts?.cwd,
15431
+ timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
15432
+ env: gitEnv(),
15433
+ maxBuffer: 50 * 1024 * 1024
15434
+ // 50MB
15435
+ });
15436
+ return stdout.trim();
15437
+ }
15438
+ async function acquireLock(lockPath) {
15439
+ const start = Date.now();
15440
+ while (Date.now() - start < LOCK_TIMEOUT_MS) {
15441
+ try {
15442
+ await (0, import_promises27.writeFile)(lockPath, String(process.pid), { flag: "wx" });
15443
+ return;
15444
+ } catch (err) {
15445
+ if (err.code === "EEXIST") {
15446
+ await new Promise((r) => setTimeout(r, 200));
15447
+ continue;
15448
+ }
15449
+ throw err;
15450
+ }
15451
+ }
15452
+ throw new Error(`Timed out waiting for lock: ${lockPath}`);
15453
+ }
15454
+ async function releaseLock(lockPath) {
15455
+ try {
15456
+ await (0, import_promises27.unlink)(lockPath);
15457
+ } catch {
15458
+ }
15459
+ }
15460
+ var RepoManager = class {
15461
+ cacheDir;
15462
+ constructor(cacheDir) {
15463
+ this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
15464
+ }
15465
+ /**
15466
+ * Ensure a bare mirror cache exists for the given source.
15467
+ * Creates on first access, fetches updates on subsequent calls.
15468
+ * Returns the absolute path to the cache directory.
15469
+ */
15470
+ async ensureCache(source) {
15471
+ const key = cacheKey(source);
15472
+ const cachePath = import_node_path38.default.join(this.cacheDir, key);
15473
+ const lockPath = `${cachePath}.lock`;
15474
+ await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
15475
+ await acquireLock(lockPath);
15476
+ try {
15477
+ if ((0, import_node_fs11.existsSync)(import_node_path38.default.join(cachePath, "HEAD"))) {
15478
+ await git(["fetch", "--prune"], { cwd: cachePath });
15479
+ } else {
15480
+ await git(["clone", "--mirror", "--bare", getSourceUrl(source), cachePath]);
15481
+ }
15482
+ } finally {
15483
+ await releaseLock(lockPath);
15484
+ }
15485
+ return cachePath;
15486
+ }
15487
+ /**
15488
+ * Clone a repo from cache into the workspace at the configured path.
15489
+ * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
15490
+ */
15491
+ async materialize(repo, workspacePath) {
15492
+ const targetDir = import_node_path38.default.join(workspacePath, repo.path);
15493
+ const cachePath = await this.ensureCache(repo.source);
15494
+ const cloneArgs = ["clone"];
15495
+ if (repo.clone?.depth) {
15496
+ cloneArgs.push("--depth", String(repo.clone.depth));
15497
+ }
15498
+ if (repo.clone?.filter) {
15499
+ cloneArgs.push("--filter", repo.clone.filter);
15500
+ }
15501
+ cloneArgs.push("--no-checkout");
15502
+ const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
15503
+ cloneArgs.push(cloneUrl, targetDir);
15504
+ await git(cloneArgs);
15505
+ if (repo.clone?.sparse?.length) {
15506
+ await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
15507
+ await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
15508
+ }
15509
+ const ref = repo.checkout?.ref ?? "HEAD";
15510
+ const resolve = repo.checkout?.resolve ?? "remote";
15511
+ let resolvedSha;
15512
+ if (resolve === "remote" && repo.source.type === "git") {
15513
+ const url = getSourceUrl(repo.source);
15514
+ try {
15515
+ const lsOutput = await git(["ls-remote", url, ref]);
15516
+ const match = lsOutput.split(" ")[0];
15517
+ if (!match) {
15518
+ throw new Error(`Ref '${ref}' not found on remote ${url}`);
15519
+ }
15520
+ resolvedSha = match;
15521
+ } catch (err) {
15522
+ if (err instanceof Error && err.message.includes("not found")) throw err;
15523
+ resolvedSha = ref;
15524
+ }
15525
+ } else {
15526
+ resolvedSha = ref;
15527
+ }
15528
+ await git(["checkout", resolvedSha], { cwd: targetDir });
15529
+ const ancestor = repo.checkout?.ancestor ?? 0;
15530
+ if (ancestor > 0) {
15531
+ try {
15532
+ const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
15533
+ await git(["checkout", ancestorSha], { cwd: targetDir });
15534
+ } catch {
15535
+ if (repo.clone?.depth) {
15536
+ await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
15537
+ const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
15538
+ await git(["checkout", ancestorSha], { cwd: targetDir });
15539
+ } else {
15540
+ throw new Error(
15541
+ `Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`
15542
+ );
15543
+ }
15544
+ }
15545
+ }
15546
+ }
15547
+ /** Materialize all repos into the workspace. */
15548
+ async materializeAll(repos, workspacePath) {
15549
+ for (const repo of repos) {
15550
+ await this.materialize(repo, workspacePath);
15551
+ }
15552
+ }
15553
+ /** Reset repos in workspace to their checkout state. */
15554
+ async reset(repos, workspacePath, strategy) {
15555
+ if (strategy === "recreate") {
15556
+ for (const repo of repos) {
15557
+ const targetDir = import_node_path38.default.join(workspacePath, repo.path);
15558
+ await (0, import_promises27.rm)(targetDir, { recursive: true, force: true });
15559
+ }
15560
+ await this.materializeAll(repos, workspacePath);
15561
+ return;
15562
+ }
15563
+ for (const repo of repos) {
15564
+ const targetDir = import_node_path38.default.join(workspacePath, repo.path);
15565
+ await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
15566
+ await git(["clean", "-fd"], { cwd: targetDir });
15567
+ }
15568
+ }
15569
+ /** Remove the entire cache directory. */
15570
+ async cleanCache() {
15571
+ await (0, import_promises27.rm)(this.cacheDir, { recursive: true, force: true });
15572
+ }
15573
+ };
15574
+
15575
+ // src/evaluation/workspace/resolve.ts
15576
+ var import_promises28 = require("fs/promises");
15577
+ var import_node_path39 = __toESM(require("path"), 1);
15059
15578
  async function resolveWorkspaceTemplate(templatePath) {
15060
15579
  if (!templatePath) {
15061
15580
  return void 0;
15062
15581
  }
15063
- const resolved = import_node_path38.default.resolve(templatePath);
15064
- const stats = await (0, import_promises27.stat)(resolved);
15582
+ const resolved = import_node_path39.default.resolve(templatePath);
15583
+ const stats = await (0, import_promises28.stat)(resolved);
15065
15584
  if (stats.isFile()) {
15066
15585
  return {
15067
- dir: import_node_path38.default.dirname(resolved),
15586
+ dir: import_node_path39.default.dirname(resolved),
15068
15587
  workspaceFile: resolved
15069
15588
  };
15070
15589
  }
15071
15590
  if (!stats.isDirectory()) {
15072
15591
  throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
15073
15592
  }
15074
- const entries = await (0, import_promises27.readdir)(resolved);
15593
+ const entries = await (0, import_promises28.readdir)(resolved);
15075
15594
  const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
15076
15595
  if (workspaceFiles.length === 1) {
15077
15596
  return {
15078
15597
  dir: resolved,
15079
- workspaceFile: import_node_path38.default.join(resolved, workspaceFiles[0])
15598
+ workspaceFile: import_node_path39.default.join(resolved, workspaceFiles[0])
15080
15599
  };
15081
15600
  }
15082
15601
  if (workspaceFiles.length > 1) {
15083
15602
  const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
15084
15603
  return {
15085
15604
  dir: resolved,
15086
- workspaceFile: conventionFile ? import_node_path38.default.join(resolved, conventionFile) : void 0
15605
+ workspaceFile: conventionFile ? import_node_path39.default.join(resolved, conventionFile) : void 0
15087
15606
  };
15088
15607
  }
15089
15608
  return { dir: resolved };
@@ -15158,7 +15677,7 @@ async function runEvaluation(options) {
15158
15677
  );
15159
15678
  useCache = false;
15160
15679
  }
15161
- const evalRunId = (0, import_node_crypto8.randomUUID)();
15680
+ const evalRunId = (0, import_node_crypto9.randomUUID)();
15162
15681
  const evalCases = preloadedEvalCases ?? await loadTests(evalFilePath, repoRoot, { verbose, filter });
15163
15682
  const filteredEvalCases = filterEvalCases(evalCases, filter);
15164
15683
  if (filteredEvalCases.length === 0) {
@@ -15205,6 +15724,11 @@ async function runEvaluation(options) {
15205
15724
  }
15206
15725
  return getOrCreateProvider(resolvedJudge);
15207
15726
  };
15727
+ if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
15728
+ throw new Error(
15729
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
15730
+ );
15731
+ }
15208
15732
  const targetResolver = (name) => {
15209
15733
  const resolved = resolveTargetByName(name);
15210
15734
  if (!resolved) {
@@ -15218,7 +15742,7 @@ async function runEvaluation(options) {
15218
15742
  ];
15219
15743
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
15220
15744
  const typeRegistry = createBuiltinRegistry();
15221
- const discoveryBaseDir = evalFilePath ? import_node_path39.default.dirname(import_node_path39.default.resolve(evalFilePath)) : process.cwd();
15745
+ const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
15222
15746
  await discoverAssertions(typeRegistry, discoveryBaseDir);
15223
15747
  const providerRegistry = createBuiltinProviderRegistry();
15224
15748
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -15273,7 +15797,8 @@ async function runEvaluation(options) {
15273
15797
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
15274
15798
  const workspaceTemplate = resolvedTemplate?.dir;
15275
15799
  const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
15276
- const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all);
15800
+ const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
15801
+ const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
15277
15802
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
15278
15803
  const workers = hasSharedWorkspace ? 1 : requestedWorkers;
15279
15804
  if (hasSharedWorkspace && requestedWorkers > 1) {
@@ -15292,9 +15817,22 @@ async function runEvaluation(options) {
15292
15817
  const message = error instanceof Error ? error.message : String(error);
15293
15818
  throw new Error(`Failed to create shared workspace: ${message}`);
15294
15819
  }
15295
- } else if (suiteWorkspace?.before_all) {
15820
+ } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
15296
15821
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
15297
- await (0, import_promises28.mkdir)(sharedWorkspacePath, { recursive: true });
15822
+ await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
15823
+ }
15824
+ const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
15825
+ if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
15826
+ try {
15827
+ await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
15828
+ } catch (error) {
15829
+ const message = error instanceof Error ? error.message : String(error);
15830
+ if (sharedWorkspacePath) {
15831
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
15832
+ });
15833
+ }
15834
+ throw new Error(`Failed to materialize repos: ${message}`);
15835
+ }
15298
15836
  }
15299
15837
  if (sharedWorkspacePath && suiteWorkspace?.before_all) {
15300
15838
  const scriptContext = {
@@ -15385,7 +15923,8 @@ async function runEvaluation(options) {
15385
15923
  sharedBaselineCommit,
15386
15924
  suiteWorkspaceFile,
15387
15925
  streamCallbacks,
15388
- typeRegistry
15926
+ typeRegistry,
15927
+ repoManager
15389
15928
  };
15390
15929
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
15391
15930
  if (totalBudgetUsd !== void 0) {
@@ -15660,15 +16199,16 @@ async function runEvalCase(options) {
15660
16199
  sharedWorkspacePath,
15661
16200
  sharedBaselineCommit,
15662
16201
  suiteWorkspaceFile,
15663
- typeRegistry: providedTypeRegistry
16202
+ typeRegistry: providedTypeRegistry,
16203
+ repoManager
15664
16204
  } = options;
15665
16205
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
15666
16206
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
15667
16207
  const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
15668
- const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
16208
+ const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
15669
16209
  let cachedResponse;
15670
- if (cacheKey && cache) {
15671
- cachedResponse = await cache.get(cacheKey);
16210
+ if (cacheKey2 && cache) {
16211
+ cachedResponse = await cache.get(cacheKey2);
15672
16212
  }
15673
16213
  const nowFn = now ?? (() => /* @__PURE__ */ new Date());
15674
16214
  let workspacePath = sharedWorkspacePath;
@@ -15697,9 +16237,25 @@ async function runEvalCase(options) {
15697
16237
  );
15698
16238
  }
15699
16239
  }
15700
- if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
16240
+ if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
15701
16241
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
15702
- await (0, import_promises28.mkdir)(workspacePath, { recursive: true });
16242
+ await (0, import_promises29.mkdir)(workspacePath, { recursive: true });
16243
+ }
16244
+ if (evalCase.workspace?.repos?.length && workspacePath) {
16245
+ const perCaseRepoManager = new RepoManager();
16246
+ try {
16247
+ await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
16248
+ } catch (error) {
16249
+ const message = error instanceof Error ? error.message : String(error);
16250
+ return buildErrorResult(
16251
+ evalCase,
16252
+ target.name,
16253
+ nowFn(),
16254
+ new Error(`Failed to materialize repos: ${message}`),
16255
+ promptInputs,
16256
+ provider
16257
+ );
16258
+ }
15703
16259
  }
15704
16260
  if (workspacePath && evalCase.workspace?.before_all) {
15705
16261
  const scriptContext = {
@@ -15823,8 +16379,8 @@ async function runEvalCase(options) {
15823
16379
  }
15824
16380
  return errorResult;
15825
16381
  }
15826
- if (cacheKey && cache && !cachedResponse) {
15827
- await cache.set(cacheKey, providerResponse);
16382
+ if (cacheKey2 && cache && !cachedResponse) {
16383
+ await cache.set(cacheKey2, providerResponse);
15828
16384
  }
15829
16385
  const output = providerResponse.output;
15830
16386
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
@@ -15852,6 +16408,16 @@ async function runEvalCase(options) {
15852
16408
  }
15853
16409
  }
15854
16410
  const providerError = extractProviderError(providerResponse);
16411
+ if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
16412
+ try {
16413
+ await repoManager.reset(
16414
+ evalCase.workspace.repos,
16415
+ workspacePath,
16416
+ evalCase.workspace.reset.strategy
16417
+ );
16418
+ } catch {
16419
+ }
16420
+ }
15855
16421
  if (workspacePath && evalCase.workspace?.after_each) {
15856
16422
  const scriptContext = {
15857
16423
  workspacePath,
@@ -16216,7 +16782,7 @@ async function runEvaluatorList(options) {
16216
16782
  fileChanges,
16217
16783
  workspacePath
16218
16784
  };
16219
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path39.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
16785
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path40.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
16220
16786
  const dispatchContext = {
16221
16787
  judgeProvider,
16222
16788
  targetResolver,
@@ -16306,8 +16872,9 @@ async function runEvaluatorList(options) {
16306
16872
  const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
16307
16873
  return entry.score.score < minScore;
16308
16874
  });
16309
- const aggregateScore = hasRequiredFailure ? 0 : scored.length > 0 ? computeWeightedMean(
16310
- scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
16875
+ const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
16876
+ const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
16877
+ scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
16311
16878
  ) : 0;
16312
16879
  const hits = scored.flatMap((entry) => entry.score.hits);
16313
16880
  const misses = scored.flatMap((entry) => entry.score.misses);
@@ -16447,7 +17014,7 @@ function extractProviderError(response) {
16447
17014
  return trimmed.length > 0 ? trimmed : void 0;
16448
17015
  }
16449
17016
  function createCacheKey(provider, target, evalCase, promptInputs) {
16450
- const hash = (0, import_node_crypto8.createHash)("sha256");
17017
+ const hash = (0, import_node_crypto9.createHash)("sha256");
16451
17018
  hash.update(provider.id);
16452
17019
  hash.update(target.name);
16453
17020
  hash.update(evalCase.id);
@@ -16515,8 +17082,8 @@ function computeWeightedMean(entries) {
16515
17082
  }
16516
17083
 
16517
17084
  // src/evaluation/evaluate.ts
16518
- var import_node_fs11 = require("fs");
16519
- var import_node_path40 = __toESM(require("path"), 1);
17085
+ var import_node_fs12 = require("fs");
17086
+ var import_node_path41 = __toESM(require("path"), 1);
16520
17087
  async function evaluate(config) {
16521
17088
  const startTime = Date.now();
16522
17089
  if (config.tests && config.specFile) {
@@ -16538,13 +17105,13 @@ async function evaluate(config) {
16538
17105
  let evalCases;
16539
17106
  let testFilePath;
16540
17107
  if (config.specFile) {
16541
- testFilePath = import_node_path40.default.resolve(config.specFile);
17108
+ testFilePath = import_node_path41.default.resolve(config.specFile);
16542
17109
  evalCases = await loadTests(testFilePath, repoRoot, {
16543
17110
  verbose: config.verbose,
16544
17111
  filter: config.filter
16545
17112
  });
16546
17113
  } else {
16547
- testFilePath = import_node_path40.default.join(process.cwd(), "__programmatic__.yaml");
17114
+ testFilePath = import_node_path41.default.join(process.cwd(), "__programmatic__.yaml");
16548
17115
  evalCases = (config.tests ?? []).map((test) => {
16549
17116
  const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
16550
17117
  const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
@@ -16635,11 +17202,11 @@ function computeSummary(results, durationMs) {
16635
17202
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
16636
17203
  async function discoverDefaultTarget(repoRoot) {
16637
17204
  const cwd = process.cwd();
16638
- const chain = buildDirectoryChain2(import_node_path40.default.join(cwd, "_placeholder"), repoRoot);
17205
+ const chain = buildDirectoryChain2(import_node_path41.default.join(cwd, "_placeholder"), repoRoot);
16639
17206
  for (const dir of chain) {
16640
17207
  for (const candidate of TARGET_FILE_CANDIDATES) {
16641
- const targetsPath = import_node_path40.default.join(dir, candidate);
16642
- if (!(0, import_node_fs11.existsSync)(targetsPath)) continue;
17208
+ const targetsPath = import_node_path41.default.join(dir, candidate);
17209
+ if (!(0, import_node_fs12.existsSync)(targetsPath)) continue;
16643
17210
  try {
16644
17211
  const definitions = await readTargetDefinitions(targetsPath);
16645
17212
  const defaultTarget = definitions.find((d) => d.name === "default");
@@ -16653,11 +17220,11 @@ async function discoverDefaultTarget(repoRoot) {
16653
17220
  async function loadEnvHierarchy(repoRoot) {
16654
17221
  const { readFileSync: readFileSync2 } = await import("fs");
16655
17222
  const cwd = process.cwd();
16656
- const chain = buildDirectoryChain2(import_node_path40.default.join(cwd, "_placeholder"), repoRoot);
17223
+ const chain = buildDirectoryChain2(import_node_path41.default.join(cwd, "_placeholder"), repoRoot);
16657
17224
  const envFiles = [];
16658
17225
  for (const dir of chain) {
16659
- const envPath = import_node_path40.default.join(dir, ".env");
16660
- if ((0, import_node_fs11.existsSync)(envPath)) envFiles.push(envPath);
17226
+ const envPath = import_node_path41.default.join(dir, ".env");
17227
+ if ((0, import_node_fs12.existsSync)(envPath)) envFiles.push(envPath);
16661
17228
  }
16662
17229
  for (let i = envFiles.length - 1; i >= 0; i--) {
16663
17230
  try {
@@ -16727,12 +17294,12 @@ var CONFIG_FILE_NAMES = [
16727
17294
  ".agentv/config.js"
16728
17295
  ];
16729
17296
  async function loadTsConfig(projectRoot) {
16730
- const { existsSync: existsSync3 } = await import("fs");
17297
+ const { existsSync: existsSync4 } = await import("fs");
16731
17298
  const { pathToFileURL } = await import("url");
16732
17299
  const { join: join2 } = await import("path");
16733
17300
  for (const fileName of CONFIG_FILE_NAMES) {
16734
17301
  const filePath = join2(projectRoot, fileName);
16735
- if (!existsSync3(filePath)) {
17302
+ if (!existsSync4(filePath)) {
16736
17303
  continue;
16737
17304
  }
16738
17305
  try {
@@ -16829,8 +17396,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
16829
17396
  }
16830
17397
 
16831
17398
  // src/evaluation/cache/response-cache.ts
16832
- var import_promises29 = require("fs/promises");
16833
- var import_node_path41 = __toESM(require("path"), 1);
17399
+ var import_promises30 = require("fs/promises");
17400
+ var import_node_path42 = __toESM(require("path"), 1);
16834
17401
  var DEFAULT_CACHE_PATH = ".agentv/cache";
16835
17402
  var ResponseCache = class {
16836
17403
  cachePath;
@@ -16840,7 +17407,7 @@ var ResponseCache = class {
16840
17407
  async get(key) {
16841
17408
  const filePath = this.keyToPath(key);
16842
17409
  try {
16843
- const data = await (0, import_promises29.readFile)(filePath, "utf8");
17410
+ const data = await (0, import_promises30.readFile)(filePath, "utf8");
16844
17411
  return JSON.parse(data);
16845
17412
  } catch {
16846
17413
  return void 0;
@@ -16848,13 +17415,13 @@ var ResponseCache = class {
16848
17415
  }
16849
17416
  async set(key, value) {
16850
17417
  const filePath = this.keyToPath(key);
16851
- const dir = import_node_path41.default.dirname(filePath);
16852
- await (0, import_promises29.mkdir)(dir, { recursive: true });
16853
- await (0, import_promises29.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
17418
+ const dir = import_node_path42.default.dirname(filePath);
17419
+ await (0, import_promises30.mkdir)(dir, { recursive: true });
17420
+ await (0, import_promises30.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
16854
17421
  }
16855
17422
  keyToPath(key) {
16856
17423
  const prefix = key.slice(0, 2);
16857
- return import_node_path41.default.join(this.cachePath, prefix, `${key}.json`);
17424
+ return import_node_path42.default.join(this.cachePath, prefix, `${key}.json`);
16858
17425
  }
16859
17426
  };
16860
17427
  function shouldEnableCache(params) {
@@ -17332,6 +17899,7 @@ function createAgentKernel() {
17332
17899
  OtelTraceExporter,
17333
17900
  OtlpJsonFileExporter,
17334
17901
  ProviderRegistry,
17902
+ RepoManager,
17335
17903
  ResponseCache,
17336
17904
  SimpleTraceFileExporter,
17337
17905
  TEST_MESSAGE_ROLES,
@@ -17417,12 +17985,19 @@ function createAgentKernel() {
17417
17985
  resolveTargetDefinition,
17418
17986
  resolveWorkspaceTemplate,
17419
17987
  rubricEvaluationSchema,
17988
+ runContainsAllAssertion,
17989
+ runContainsAnyAssertion,
17420
17990
  runContainsAssertion,
17991
+ runEndsWithAssertion,
17421
17992
  runEqualsAssertion,
17422
17993
  runEvalCase,
17423
17994
  runEvaluation,
17995
+ runIcontainsAllAssertion,
17996
+ runIcontainsAnyAssertion,
17997
+ runIcontainsAssertion,
17424
17998
  runIsJsonAssertion,
17425
17999
  runRegexAssertion,
18000
+ runStartsWithAssertion,
17426
18001
  scoreToVerdict,
17427
18002
  shouldEnableCache,
17428
18003
  shouldSkipCacheForTemperature,