@agentv/core 2.10.0 → 2.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1244,12 +1244,12 @@ function serializeAttributeValue(value) {
1244
1244
  if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
1245
1245
  return { stringValue: String(value) };
1246
1246
  }
1247
- var import_promises30, import_node_path42, OtlpJsonFileExporter;
1247
+ var import_promises31, import_node_path43, OtlpJsonFileExporter;
1248
1248
  var init_otlp_json_file_exporter = __esm({
1249
1249
  "src/observability/otlp-json-file-exporter.ts"() {
1250
1250
  "use strict";
1251
- import_promises30 = require("fs/promises");
1252
- import_node_path42 = require("path");
1251
+ import_promises31 = require("fs/promises");
1252
+ import_node_path43 = require("path");
1253
1253
  OtlpJsonFileExporter = class {
1254
1254
  // biome-ignore lint/suspicious/noExplicitAny: serialized span data
1255
1255
  spans = [];
@@ -1288,7 +1288,7 @@ var init_otlp_json_file_exporter = __esm({
1288
1288
  }
1289
1289
  async flush() {
1290
1290
  if (this.spans.length === 0) return;
1291
- await (0, import_promises30.mkdir)((0, import_node_path42.dirname)(this.filePath), { recursive: true });
1291
+ await (0, import_promises31.mkdir)((0, import_node_path43.dirname)(this.filePath), { recursive: true });
1292
1292
  const otlpJson = {
1293
1293
  resourceSpans: [
1294
1294
  {
@@ -1302,8 +1302,8 @@ var init_otlp_json_file_exporter = __esm({
1302
1302
  }
1303
1303
  ]
1304
1304
  };
1305
- const { writeFile: writeFile8 } = await import("fs/promises");
1306
- await writeFile8(this.filePath, JSON.stringify(otlpJson, null, 2));
1305
+ const { writeFile: writeFile9 } = await import("fs/promises");
1306
+ await writeFile9(this.filePath, JSON.stringify(otlpJson, null, 2));
1307
1307
  }
1308
1308
  };
1309
1309
  }
@@ -1319,13 +1319,13 @@ function hrTimeDiffMs(start, end) {
1319
1319
  const diffNano = end[1] - start[1];
1320
1320
  return Math.round(diffSec * 1e3 + diffNano / 1e6);
1321
1321
  }
1322
- var import_node_fs12, import_promises31, import_node_path43, SimpleTraceFileExporter;
1322
+ var import_node_fs13, import_promises32, import_node_path44, SimpleTraceFileExporter;
1323
1323
  var init_simple_trace_file_exporter = __esm({
1324
1324
  "src/observability/simple-trace-file-exporter.ts"() {
1325
1325
  "use strict";
1326
- import_node_fs12 = require("fs");
1327
- import_promises31 = require("fs/promises");
1328
- import_node_path43 = require("path");
1326
+ import_node_fs13 = require("fs");
1327
+ import_promises32 = require("fs/promises");
1328
+ import_node_path44 = require("path");
1329
1329
  SimpleTraceFileExporter = class {
1330
1330
  stream = null;
1331
1331
  filePath;
@@ -1338,8 +1338,8 @@ var init_simple_trace_file_exporter = __esm({
1338
1338
  async ensureStream() {
1339
1339
  if (!this.streamReady) {
1340
1340
  this.streamReady = (async () => {
1341
- await (0, import_promises31.mkdir)((0, import_node_path43.dirname)(this.filePath), { recursive: true });
1342
- this.stream = (0, import_node_fs12.createWriteStream)(this.filePath, { flags: "w" });
1341
+ await (0, import_promises32.mkdir)((0, import_node_path44.dirname)(this.filePath), { recursive: true });
1342
+ this.stream = (0, import_node_fs13.createWriteStream)(this.filePath, { flags: "w" });
1343
1343
  return this.stream;
1344
1344
  })();
1345
1345
  }
@@ -1448,6 +1448,7 @@ __export(index_exports, {
1448
1448
  OtelTraceExporter: () => OtelTraceExporter,
1449
1449
  OtlpJsonFileExporter: () => OtlpJsonFileExporter,
1450
1450
  ProviderRegistry: () => ProviderRegistry,
1451
+ RepoManager: () => RepoManager,
1451
1452
  ResponseCache: () => ResponseCache,
1452
1453
  SimpleTraceFileExporter: () => SimpleTraceFileExporter,
1453
1454
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
@@ -1533,12 +1534,19 @@ __export(index_exports, {
1533
1534
  resolveTargetDefinition: () => resolveTargetDefinition,
1534
1535
  resolveWorkspaceTemplate: () => resolveWorkspaceTemplate,
1535
1536
  rubricEvaluationSchema: () => rubricEvaluationSchema,
1537
+ runContainsAllAssertion: () => runContainsAllAssertion,
1538
+ runContainsAnyAssertion: () => runContainsAnyAssertion,
1536
1539
  runContainsAssertion: () => runContainsAssertion,
1540
+ runEndsWithAssertion: () => runEndsWithAssertion,
1537
1541
  runEqualsAssertion: () => runEqualsAssertion,
1538
1542
  runEvalCase: () => runEvalCase,
1539
1543
  runEvaluation: () => runEvaluation,
1544
+ runIcontainsAllAssertion: () => runIcontainsAllAssertion,
1545
+ runIcontainsAnyAssertion: () => runIcontainsAnyAssertion,
1546
+ runIcontainsAssertion: () => runIcontainsAssertion,
1540
1547
  runIsJsonAssertion: () => runIsJsonAssertion,
1541
1548
  runRegexAssertion: () => runRegexAssertion,
1549
+ runStartsWithAssertion: () => runStartsWithAssertion,
1542
1550
  scoreToVerdict: () => scoreToVerdict,
1543
1551
  shouldEnableCache: () => shouldEnableCache,
1544
1552
  shouldSkipCacheForTemperature: () => shouldSkipCacheForTemperature,
@@ -1615,6 +1623,13 @@ var EVALUATOR_KIND_VALUES = [
1615
1623
  "execution_metrics",
1616
1624
  "agent_judge",
1617
1625
  "contains",
1626
+ "contains_any",
1627
+ "contains_all",
1628
+ "icontains",
1629
+ "icontains_any",
1630
+ "icontains_all",
1631
+ "starts_with",
1632
+ "ends_with",
1618
1633
  "regex",
1619
1634
  "is_json",
1620
1635
  "equals",
@@ -2017,9 +2032,14 @@ async function loadConfig(evalFilePath, repoRoot) {
2017
2032
  logWarning(`Invalid eval_patterns in ${configPath}, all entries must be strings`);
2018
2033
  continue;
2019
2034
  }
2035
+ const executionDefaults = parseExecutionDefaults(
2036
+ parsed.execution,
2037
+ configPath
2038
+ );
2020
2039
  return {
2021
2040
  guideline_patterns: guidelinePatterns,
2022
- eval_patterns: evalPatterns
2041
+ eval_patterns: evalPatterns,
2042
+ execution: executionDefaults
2023
2043
  };
2024
2044
  } catch (error) {
2025
2045
  logWarning(
@@ -2160,6 +2180,36 @@ function extractTotalBudgetUsd(suite) {
2160
2180
  );
2161
2181
  return void 0;
2162
2182
  }
2183
/**
 * Parse the optional `execution` section of a config file into execution defaults.
 *
 * Recognised keys are `verbose` and `keep_workspaces` (booleans) and
 * `trace_file` and `otel_file` (non-empty strings, stored trimmed). A key whose
 * value has the wrong type is left out of the result and reported through
 * `logWarning`; keys other than these four are ignored.
 *
 * @param {unknown} raw - Value of the `execution` key as read from the config.
 * @param {string} configPath - Config file path, used only in warning messages.
 * @returns {object | undefined} The valid settings, or `undefined` when the
 *   section is absent, is not an object, or holds no valid setting.
 */
function parseExecutionDefaults(raw, configPath) {
  if (raw === void 0 || raw === null) {
    return void 0;
  }
  if (typeof raw !== "object" || Array.isArray(raw)) {
    // Malformed fields are reported below, so a malformed section is reported too
    // instead of being discarded without a trace.
    logWarning(`Invalid execution in ${configPath}, expected object`);
    return void 0;
  }
  // Order matters: it fixes both the key order of the result and the warning order.
  const fields = [
    ["verbose", "boolean"],
    ["trace_file", "non-empty string"],
    ["keep_workspaces", "boolean"],
    ["otel_file", "non-empty string"]
  ];
  const result = {};
  for (const [key, expected] of fields) {
    const candidate = raw[key];
    if (candidate === void 0) {
      continue;
    }
    if (expected === "boolean" && typeof candidate === "boolean") {
      result[key] = candidate;
    } else if (expected !== "boolean" && typeof candidate === "string" && candidate.trim().length > 0) {
      result[key] = candidate.trim();
    } else {
      logWarning(`Invalid execution.${key} in ${configPath}, expected ${expected}`);
    }
  }
  return Object.keys(result).length > 0 ? result : void 0;
}
2163
2213
  function logWarning(message) {
2164
2214
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
2165
2215
  }
@@ -2888,18 +2938,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2888
2938
  });
2889
2939
  continue;
2890
2940
  }
2941
+ if (typeValue === "contains_any" || typeValue === "contains_all") {
2942
+ const value = asStringArrayStrict(rawEvaluator.value);
2943
+ if (!value || value.length === 0) {
2944
+ logWarning2(
2945
+ `Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
2946
+ );
2947
+ continue;
2948
+ }
2949
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2950
+ const required2 = parseRequired(rawEvaluator.required);
2951
+ evaluators.push({
2952
+ name,
2953
+ type: typeValue,
2954
+ value,
2955
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
2956
+ ...required2 !== void 0 ? { required: required2 } : {},
2957
+ ...negate !== void 0 ? { negate } : {}
2958
+ });
2959
+ continue;
2960
+ }
2961
+ if (typeValue === "icontains") {
2962
+ const value = asString(rawEvaluator.value);
2963
+ if (!value) {
2964
+ logWarning2(`Skipping icontains evaluator '${name}' in '${evalId}': missing value`);
2965
+ continue;
2966
+ }
2967
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2968
+ const required2 = parseRequired(rawEvaluator.required);
2969
+ evaluators.push({
2970
+ name,
2971
+ type: "icontains",
2972
+ value,
2973
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
2974
+ ...required2 !== void 0 ? { required: required2 } : {},
2975
+ ...negate !== void 0 ? { negate } : {}
2976
+ });
2977
+ continue;
2978
+ }
2979
+ if (typeValue === "icontains_any" || typeValue === "icontains_all") {
2980
+ const value = asStringArrayStrict(rawEvaluator.value);
2981
+ if (!value || value.length === 0) {
2982
+ logWarning2(
2983
+ `Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
2984
+ );
2985
+ continue;
2986
+ }
2987
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2988
+ const required2 = parseRequired(rawEvaluator.required);
2989
+ evaluators.push({
2990
+ name,
2991
+ type: typeValue,
2992
+ value,
2993
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
2994
+ ...required2 !== void 0 ? { required: required2 } : {},
2995
+ ...negate !== void 0 ? { negate } : {}
2996
+ });
2997
+ continue;
2998
+ }
2999
+ if (typeValue === "starts_with" || typeValue === "ends_with") {
3000
+ const value = asString(rawEvaluator.value);
3001
+ if (!value) {
3002
+ logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
3003
+ continue;
3004
+ }
3005
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
3006
+ const required2 = parseRequired(rawEvaluator.required);
3007
+ evaluators.push({
3008
+ name,
3009
+ type: typeValue,
3010
+ value,
3011
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
3012
+ ...required2 !== void 0 ? { required: required2 } : {},
3013
+ ...negate !== void 0 ? { negate } : {}
3014
+ });
3015
+ continue;
3016
+ }
2891
3017
  if (typeValue === "regex") {
2892
3018
  const value = asString(rawEvaluator.value);
2893
3019
  if (!value) {
2894
3020
  logWarning2(`Skipping regex evaluator '${name}' in '${evalId}': missing value`);
2895
3021
  continue;
2896
3022
  }
3023
+ const flags = asString(rawEvaluator.flags);
2897
3024
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
2898
3025
  const required2 = parseRequired(rawEvaluator.required);
2899
3026
  evaluators.push({
2900
3027
  name,
2901
3028
  type: "regex",
2902
3029
  value,
3030
+ ...flags !== void 0 ? { flags } : {},
2903
3031
  ...weight2 !== void 0 ? { weight: weight2 } : {},
2904
3032
  ...required2 !== void 0 ? { required: required2 } : {},
2905
3033
  ...negate !== void 0 ? { negate } : {}
@@ -3072,15 +3200,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
3072
3200
  }
3073
3201
  return evaluators.length > 0 ? evaluators : void 0;
3074
3202
  }
3075
- var ASSERTION_TYPES = /* @__PURE__ */ new Set(["contains", "regex", "is_json", "equals", "rubrics"]);
3203
+ var ASSERTION_TYPES = /* @__PURE__ */ new Set([
3204
+ "contains",
3205
+ "contains_any",
3206
+ "contains_all",
3207
+ "icontains",
3208
+ "icontains_any",
3209
+ "icontains_all",
3210
+ "starts_with",
3211
+ "ends_with",
3212
+ "regex",
3213
+ "is_json",
3214
+ "equals",
3215
+ "rubrics"
3216
+ ]);
3076
3217
  function generateAssertionName(typeValue, rawEvaluator) {
3077
3218
  if (!ASSERTION_TYPES.has(typeValue)) {
3078
3219
  return void 0;
3079
3220
  }
3080
3221
  const value = asString(rawEvaluator.value);
3222
+ const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
3081
3223
  switch (typeValue) {
3082
3224
  case "contains":
3083
3225
  return value ? `contains-${value}` : "contains";
3226
+ case "contains_any":
3227
+ return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
3228
+ case "contains_all":
3229
+ return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
3230
+ case "icontains":
3231
+ return value ? `icontains-${value}` : "icontains";
3232
+ case "icontains_any":
3233
+ return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
3234
+ case "icontains_all":
3235
+ return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
3236
+ case "starts_with":
3237
+ return value ? `starts_with-${value}` : "starts_with";
3238
+ case "ends_with":
3239
+ return value ? `ends_with-${value}` : "ends_with";
3084
3240
  case "regex":
3085
3241
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
3086
3242
  case "is_json":
@@ -3106,6 +3262,13 @@ function coerceEvaluator(candidate, contextId) {
3106
3262
  function asString(value) {
3107
3263
  return typeof value === "string" ? value : void 0;
3108
3264
  }
3265
/**
 * Return a copy of `value` when it is a non-empty array made up only of strings.
 *
 * Anything else yields `undefined`: a non-array, an empty array, or an array
 * with at least one non-string entry. Mixed arrays are rejected as a whole
 * rather than trimmed down to their string entries, so a list such as
 * ["a", 404] is never silently turned into the weaker list ["a"].
 *
 * @param {unknown} value - Candidate value.
 * @returns {string[] | undefined} A new array of the strings, or `undefined`.
 */
function asStringArrayStrict(value) {
  if (!Array.isArray(value)) {
    return void 0;
  }
  const strings = value.filter((entry) => typeof entry === "string");
  // filter() skips holes, so comparing lengths also rejects sparse arrays.
  const isUsable = strings.length > 0 && strings.length === value.length;
  return isUsable ? strings : void 0;
}
3109
3272
  function asStringArray(value, description) {
3110
3273
  if (value === void 0) {
3111
3274
  return void 0;
@@ -4423,6 +4586,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
4423
4586
  }
4424
4587
  return cwd ? { ...config, cwd } : config;
4425
4588
  }
4589
/**
 * Validate a repository `source` entry.
 *
 * Two shapes are accepted: `{ type: "git", url: <string> }` and
 * `{ type: "local", path: <string> }`. The result holds only those two keys.
 *
 * @param {unknown} raw - Candidate source value.
 * @returns {object | undefined} The normalized source, or `undefined` when
 *   `raw` is not a JSON object or matches neither shape.
 */
function parseRepoSource(raw) {
  if (!isJsonObject(raw)) {
    return void 0;
  }
  switch (raw.type) {
    case "git": {
      const url = raw.url;
      return typeof url === "string" ? { type: "git", url } : void 0;
    }
    case "local": {
      const localPath = raw.path;
      return typeof localPath === "string" ? { type: "local", path: localPath } : void 0;
    }
    default:
      return void 0;
  }
}
4600
/**
 * Validate a repository `checkout` entry.
 *
 * Keeps `ref` (non-empty string), `resolve` ("remote" or "local") and
 * `ancestor` (number); every other key, and any value of the wrong type, is
 * left out of the result.
 *
 * @param {unknown} raw - Candidate checkout value.
 * @returns {object | undefined} The recognised settings, or `undefined` when
 *   `raw` is not a JSON object or none of the three settings is present.
 */
function parseRepoCheckout(raw) {
  if (!isJsonObject(raw)) return void 0;
  const obj = raw;
  // An empty ref is treated as missing everywhere, not only when it is the sole
  // key, so the result can never carry `ref: ""`.
  const ref = typeof obj.ref === "string" && obj.ref.length > 0 ? obj.ref : void 0;
  const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
  const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
  if (ref === void 0 && resolve === void 0 && ancestor === void 0) return void 0;
  return {
    ...ref !== void 0 && { ref },
    ...resolve !== void 0 && { resolve },
    ...ancestor !== void 0 && { ancestor }
  };
}
4613
/**
 * Validate a repository `clone` entry.
 *
 * Keeps `depth` (number), `filter` (non-empty string) and `sparse` (the string
 * entries of an array, when there is at least one); every other key, and any
 * value of the wrong type, is left out of the result.
 *
 * @param {unknown} raw - Candidate clone value.
 * @returns {object | undefined} The recognised settings, or `undefined` when
 *   `raw` is not a JSON object or none of the three settings is present.
 */
function parseRepoClone(raw) {
  if (!isJsonObject(raw)) return void 0;
  const obj = raw;
  const depth = typeof obj.depth === "number" ? obj.depth : void 0;
  // An empty filter is treated as missing everywhere, not only when it is the
  // sole key, so the result can never carry `filter: ""`.
  const filter = typeof obj.filter === "string" && obj.filter.length > 0 ? obj.filter : void 0;
  const sparsePaths = Array.isArray(obj.sparse) ? obj.sparse.filter((s) => typeof s === "string") : [];
  // A list with no usable path selects nothing, so it counts as "not specified"
  // rather than being passed on as an empty array.
  const sparse = sparsePaths.length > 0 ? sparsePaths : void 0;
  if (depth === void 0 && filter === void 0 && sparse === void 0) return void 0;
  return {
    ...depth !== void 0 && { depth },
    ...filter !== void 0 && { filter },
    ...sparse !== void 0 && { sparse }
  };
}
4626
/**
 * Validate one repository entry.
 *
 * An entry needs a non-empty string `path` and a `source` accepted by
 * `parseRepoSource`. `checkout` and `clone` are optional and are added to the
 * result only when `parseRepoCheckout` / `parseRepoClone` return a value.
 *
 * @param {unknown} raw - Candidate repository entry.
 * @returns {object | undefined} The normalized entry, or `undefined` when `raw`
 *   is not a JSON object or lacks a usable `path` or `source`.
 */
function parseRepoConfig(raw) {
  if (!isJsonObject(raw)) {
    return void 0;
  }
  const repoPath = typeof raw.path === "string" ? raw.path : void 0;
  const source = parseRepoSource(raw.source);
  if (!(repoPath && source)) {
    return void 0;
  }
  const config = { path: repoPath, source };
  const checkout = parseRepoCheckout(raw.checkout);
  const clone = parseRepoClone(raw.clone);
  if (checkout !== void 0) {
    config.checkout = checkout;
  }
  if (clone !== void 0) {
    config.clone = clone;
  }
  return config;
}
4641
/**
 * Validate a workspace `reset` entry.
 *
 * Keeps `strategy` when it is "none", "hard" or "recreate", and `after_each`
 * when it is a boolean; anything else is left out of the result.
 *
 * @param {unknown} raw - Candidate reset value.
 * @returns {object | undefined} The recognised settings, or `undefined` when
 *   `raw` is not a JSON object or neither setting is present.
 */
function parseResetConfig(raw) {
  if (!isJsonObject(raw)) {
    return void 0;
  }
  const knownStrategies = ["none", "hard", "recreate"];
  const strategy = knownStrategies.includes(raw.strategy) ? raw.strategy : void 0;
  const afterEach = typeof raw.after_each === "boolean" ? raw.after_each : void 0;
  if (strategy === void 0 && afterEach === void 0) {
    return void 0;
  }
  const reset = {};
  if (strategy !== void 0) {
    reset.strategy = strategy;
  }
  if (afterEach !== void 0) {
    reset.after_each = afterEach;
  }
  return reset;
}
4426
4652
  function parseWorkspaceConfig(raw, evalFileDir) {
4427
4653
  if (!isJsonObject(raw)) return void 0;
4428
4654
  const obj = raw;
@@ -4430,13 +4656,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
4430
4656
  if (template && !import_node_path8.default.isAbsolute(template)) {
4431
4657
  template = import_node_path8.default.resolve(evalFileDir, template);
4432
4658
  }
4659
+ const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
4660
+ const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
4661
+ const reset = parseResetConfig(obj.reset);
4433
4662
  const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
4434
4663
  const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
4435
4664
  const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
4436
4665
  const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
4437
- if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach) return void 0;
4666
+ if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
4667
+ return void 0;
4438
4668
  return {
4439
4669
  ...template !== void 0 && { template },
4670
+ ...isolation !== void 0 && { isolation },
4671
+ ...repos !== void 0 && { repos },
4672
+ ...reset !== void 0 && { reset },
4440
4673
  ...beforeAll !== void 0 && { before_all: beforeAll },
4441
4674
  ...afterAll !== void 0 && { after_all: afterAll },
4442
4675
  ...beforeEach !== void 0 && { before_each: beforeEach },
@@ -4449,6 +4682,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
4449
4682
  if (!caseLevel) return suiteLevel;
4450
4683
  return {
4451
4684
  template: caseLevel.template ?? suiteLevel.template,
4685
+ isolation: caseLevel.isolation ?? suiteLevel.isolation,
4686
+ repos: caseLevel.repos ?? suiteLevel.repos,
4687
+ reset: caseLevel.reset ?? suiteLevel.reset,
4452
4688
  before_all: caseLevel.before_all ?? suiteLevel.before_all,
4453
4689
  after_all: caseLevel.after_all ?? suiteLevel.after_all,
4454
4690
  before_each: caseLevel.before_each ?? suiteLevel.before_each,
@@ -5103,11 +5339,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
5103
5339
  }
5104
5340
  return claudeSdkModule;
5105
5341
  }
5106
- var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
5107
- - Do NOT create any additional output files in the workspace.
5108
- - All intended file outputs/changes MUST be written in your response.
5109
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
5110
- This is required for evaluation scoring.`;
5111
5342
  var ClaudeProvider = class {
5112
5343
  id;
5113
5344
  kind = "claude";
@@ -5129,7 +5360,7 @@ var ClaudeProvider = class {
5129
5360
  const logger = await this.createStreamLogger(request).catch(() => void 0);
5130
5361
  const inputFiles = normalizeInputFiles(request.inputFiles);
5131
5362
  const prompt = buildPromptDocument(request, inputFiles);
5132
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT2);
5363
+ const systemPrompt = this.config.systemPrompt;
5133
5364
  const queryOptions = {
5134
5365
  permissionMode: "bypassPermissions",
5135
5366
  allowDangerouslySkipPermissions: true,
@@ -6110,11 +6341,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
6110
6341
  }
6111
6342
  return codexSdkModule;
6112
6343
  }
6113
- var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
6114
- - Do NOT create any additional output files in the workspace.
6115
- - All intended file outputs/changes MUST be written in your response.
6116
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
6117
- This is required for evaluation scoring.`;
6118
6344
  var CodexProvider = class {
6119
6345
  id;
6120
6346
  kind = "codex";
@@ -6149,7 +6375,7 @@ var CodexProvider = class {
6149
6375
  const thread = codex.startThread(threadOptions);
6150
6376
  const inputFiles = normalizeInputFiles(request.inputFiles);
6151
6377
  const basePrompt = buildPromptDocument(request, inputFiles);
6152
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT3);
6378
+ const systemPrompt = this.config.systemPrompt;
6153
6379
  const prompt = systemPrompt ? `${systemPrompt}
6154
6380
 
6155
6381
  ${basePrompt}` : basePrompt;
@@ -6516,7 +6742,7 @@ var import_node_path14 = __toESM(require("path"), 1);
6516
6742
  var import_node_url2 = require("url");
6517
6743
  var import_meta = {};
6518
6744
  function resolvePlatformCliPath() {
6519
- const os4 = (0, import_node_os2.platform)();
6745
+ const os5 = (0, import_node_os2.platform)();
6520
6746
  const cpu = (0, import_node_os2.arch)();
6521
6747
  const platformMap = {
6522
6748
  linux: "linux",
@@ -6527,13 +6753,13 @@ function resolvePlatformCliPath() {
6527
6753
  x64: "x64",
6528
6754
  arm64: "arm64"
6529
6755
  };
6530
- const osPart = platformMap[os4];
6756
+ const osPart = platformMap[os5];
6531
6757
  const archPart = archMap[cpu];
6532
6758
  if (!osPart || !archPart) {
6533
6759
  return void 0;
6534
6760
  }
6535
6761
  const packageName = `@github/copilot-${osPart}-${archPart}`;
6536
- const binaryName = os4 === "win32" ? "copilot.exe" : "copilot";
6762
+ const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
6537
6763
  try {
6538
6764
  const resolved = import_meta.resolve(`${packageName}/package.json`);
6539
6765
  const packageJsonPath = resolved.startsWith("file:") ? (0, import_node_url2.fileURLToPath)(resolved) : resolved;
@@ -6675,11 +6901,6 @@ function isLogStreamingDisabled(envKey) {
6675
6901
  }
6676
6902
 
6677
6903
  // src/evaluation/providers/copilot-cli.ts
6678
- var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
6679
- - Do NOT create any additional output files in the workspace.
6680
- - All intended file outputs/changes MUST be written in your response.
6681
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
6682
- This is required for evaluation scoring.`;
6683
6904
  var CopilotCliProvider = class {
6684
6905
  id;
6685
6906
  kind = "copilot-cli";
@@ -6841,6 +7062,16 @@ var CopilotCliProvider = class {
6841
7062
  }
6842
7063
  const endTime = (/* @__PURE__ */ new Date()).toISOString();
6843
7064
  const durationMs = Date.now() - startMs;
7065
+ const rejectedCalls = completedToolCalls.filter((tc) => {
7066
+ const out = tc.output;
7067
+ return out && (out.code === "rejected" || out.code === "denied");
7068
+ });
7069
+ if (rejectedCalls.length > 0) {
7070
+ const tools = rejectedCalls.map((tc) => tc.tool).join(", ");
7071
+ throw new Error(
7072
+ `Copilot rejected ${rejectedCalls.length} tool call(s): ${tools}. Add args: ["--yolo"] to your target config or re-run with --yolo to bypass permission checks.`
7073
+ );
7074
+ }
6844
7075
  const outputMessages = [];
6845
7076
  if (completedToolCalls.length > 0) {
6846
7077
  outputMessages.push({
@@ -6873,7 +7104,7 @@ var CopilotCliProvider = class {
6873
7104
  }
6874
7105
  }
6875
7106
  buildCliArgs() {
6876
- const args = ["--acp", "--stdio", "--allow-all-tools"];
7107
+ const args = ["--acp", "--stdio", "--allow-all-tools", "--yolo"];
6877
7108
  if (this.config.model) {
6878
7109
  args.push("--model", this.config.model);
6879
7110
  }
@@ -6882,8 +7113,8 @@ var CopilotCliProvider = class {
6882
7113
  }
6883
7114
  return args;
6884
7115
  }
6885
- resolveSystemPrompt(request) {
6886
- return this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT4);
7116
+ resolveSystemPrompt(_request) {
7117
+ return this.config.systemPrompt;
6887
7118
  }
6888
7119
  async raceWithTimeout(sendPromise, agentProcess) {
6889
7120
  const timeoutMs = this.config.timeoutMs;
@@ -7071,21 +7302,16 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
7071
7302
  }
7072
7303
  return copilotSdkModule;
7073
7304
  }
7074
- var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
7075
- - Do NOT create any additional output files in the workspace.
7076
- - All intended file outputs/changes MUST be written in your response.
7077
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
7078
- This is required for evaluation scoring.`;
7079
7305
  var CopilotSdkProvider = class {
7080
7306
  id;
7081
- kind = "copilot";
7307
+ kind = "copilot-sdk";
7082
7308
  targetName;
7083
7309
  supportsBatch = false;
7084
7310
  config;
7085
7311
  // biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
7086
7312
  client = null;
7087
7313
  constructor(targetName, config) {
7088
- this.id = `copilot:${targetName}`;
7314
+ this.id = `copilot-sdk:${targetName}`;
7089
7315
  this.targetName = targetName;
7090
7316
  this.config = config;
7091
7317
  }
@@ -7108,7 +7334,7 @@ var CopilotSdkProvider = class {
7108
7334
  if (cwd) {
7109
7335
  sessionOptions.workingDirectory = cwd;
7110
7336
  }
7111
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT5);
7337
+ const systemPrompt = this.config.systemPrompt;
7112
7338
  if (systemPrompt) {
7113
7339
  sessionOptions.systemMessage = {
7114
7340
  mode: "append",
@@ -7624,11 +7850,6 @@ function subscribeToPiLogEntries(listener) {
7624
7850
  // src/evaluation/providers/pi-coding-agent.ts
7625
7851
  var WORKSPACE_PREFIX = "agentv-pi-";
7626
7852
  var PROMPT_FILENAME = "prompt.md";
7627
- var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
7628
- - Do NOT create any additional output files in the workspace.
7629
- - All intended file outputs/changes MUST be written in your response.
7630
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
7631
- This is required for evaluation scoring.`;
7632
7853
  var PiCodingAgentProvider = class {
7633
7854
  id;
7634
7855
  kind = "pi-coding-agent";
@@ -7705,7 +7926,7 @@ var PiCodingAgentProvider = class {
7705
7926
  }
7706
7927
  return import_node_path17.default.resolve(this.config.cwd);
7707
7928
  }
7708
- buildPiArgs(prompt, inputFiles, captureFileChanges2) {
7929
+ buildPiArgs(prompt, inputFiles, _captureFileChanges) {
7709
7930
  const args = [];
7710
7931
  if (this.config.provider) {
7711
7932
  args.push("--provider", this.config.provider);
@@ -7733,7 +7954,7 @@ var PiCodingAgentProvider = class {
7733
7954
  args.push(`@${file}`);
7734
7955
  }
7735
7956
  }
7736
- const systemPrompt = this.config.systemPrompt ?? (captureFileChanges2 ? void 0 : DEFAULT_SYSTEM_PROMPT6);
7957
+ const systemPrompt = this.config.systemPrompt;
7737
7958
  const fullPrompt = systemPrompt ? `${systemPrompt}
7738
7959
 
7739
7960
  ${prompt}` : prompt;
@@ -8604,17 +8825,17 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
8604
8825
  providerBatching,
8605
8826
  config: resolveCodexConfig(parsed, env, evalFilePath)
8606
8827
  };
8607
- case "copilot":
8608
8828
  case "copilot-sdk":
8609
8829
  case "copilot_sdk":
8610
8830
  return {
8611
- kind: "copilot",
8831
+ kind: "copilot-sdk",
8612
8832
  name: parsed.name,
8613
8833
  judgeTarget: parsed.judge_target,
8614
8834
  workers: parsed.workers,
8615
8835
  providerBatching,
8616
8836
  config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
8617
8837
  };
8838
+ case "copilot":
8618
8839
  case "copilot-cli":
8619
8840
  return {
8620
8841
  kind: "copilot-cli",
@@ -9225,8 +9446,8 @@ function resolveCliConfig(target, env, evalFilePath) {
9225
9446
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
9226
9447
  if (!parseResult.success) {
9227
9448
  const firstError = parseResult.error.errors[0];
9228
- const path41 = firstError?.path.join(".") || "";
9229
- const prefix = path41 ? `${target.name} ${path41}: ` : `${target.name}: `;
9449
+ const path42 = firstError?.path.join(".") || "";
9450
+ const prefix = path42 ? `${target.name} ${path42}: ` : `${target.name}: `;
9230
9451
  throw new Error(`${prefix}${firstError?.message}`);
9231
9452
  }
9232
9453
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -10523,7 +10744,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
10523
10744
 
10524
10745
  **IMPORTANT**: Follow these exact steps:
10525
10746
  1. Create and write your complete response to: {{responseFileTmp}}
10526
- - Do NOT create any additional output files in the workspace.
10527
10747
  - All intended file outputs/changes MUST be written in your response file.
10528
10748
  - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
10529
10749
  2. When completely finished, run these PowerShell commands to signal completion:
@@ -10542,7 +10762,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
10542
10762
 
10543
10763
  **IMPORTANT**: Follow these exact steps:
10544
10764
  1. Create and write your complete response to: {{responseFileTmp}}
10545
- - Do NOT create any additional output files in the workspace.
10546
10765
  - All intended file outputs/changes MUST be written in your response file.
10547
10766
  - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
10548
10767
  2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
@@ -10968,7 +11187,7 @@ async function discoverProviders(registry, baseDir) {
10968
11187
  // src/evaluation/providers/index.ts
10969
11188
  function createBuiltinProviderRegistry() {
10970
11189
  const registry = new ProviderRegistry();
10971
- registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
11190
+ registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
10972
11191
  "vscode-insiders",
10973
11192
  (t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
10974
11193
  );
@@ -11157,16 +11376,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
11157
11376
  });
11158
11377
  }
11159
11378
  async function execShellWithStdin(command, stdinPayload, options = {}) {
11160
- const { mkdir: mkdir15, readFile: readFile13, rm: rm5, writeFile: writeFile8 } = await import("fs/promises");
11379
+ const { mkdir: mkdir16, readFile: readFile13, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
11161
11380
  const { tmpdir: tmpdir3 } = await import("os");
11162
- const path41 = await import("path");
11381
+ const path42 = await import("path");
11163
11382
  const { randomUUID: randomUUID8 } = await import("crypto");
11164
- const dir = path41.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
11165
- await mkdir15(dir, { recursive: true });
11166
- const stdinPath = path41.join(dir, "stdin.txt");
11167
- const stdoutPath = path41.join(dir, "stdout.txt");
11168
- const stderrPath = path41.join(dir, "stderr.txt");
11169
- await writeFile8(stdinPath, stdinPayload, "utf8");
11383
+ const dir = path42.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
11384
+ await mkdir16(dir, { recursive: true });
11385
+ const stdinPath = path42.join(dir, "stdin.txt");
11386
+ const stdoutPath = path42.join(dir, "stdout.txt");
11387
+ const stderrPath = path42.join(dir, "stderr.txt");
11388
+ await writeFile9(stdinPath, stdinPayload, "utf8");
11170
11389
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
11171
11390
  const { spawn: spawn4 } = await import("child_process");
11172
11391
  try {
@@ -11199,7 +11418,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
11199
11418
  const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
11200
11419
  return { stdout, stderr, exitCode };
11201
11420
  } finally {
11202
- await rm5(dir, { recursive: true, force: true });
11421
+ await rm6(dir, { recursive: true, force: true });
11203
11422
  }
11204
11423
  }
11205
11424
 
@@ -11517,7 +11736,7 @@ var CodeEvaluator = class {
11517
11736
  outputPath,
11518
11737
  guidelineFiles: context2.evalCase.guideline_paths,
11519
11738
  inputFiles: context2.evalCase.file_paths.filter(
11520
- (path41) => !context2.evalCase.guideline_paths.includes(path41)
11739
+ (path42) => !context2.evalCase.guideline_paths.includes(path42)
11521
11740
  ),
11522
11741
  input: context2.evalCase.input,
11523
11742
  trace: context2.trace ?? null,
@@ -11648,7 +11867,7 @@ var import_ai3 = require("ai");
11648
11867
  // src/evaluation/providers/types.ts
11649
11868
  var AGENT_PROVIDER_KINDS = [
11650
11869
  "codex",
11651
- "copilot",
11870
+ "copilot-sdk",
11652
11871
  "copilot-cli",
11653
11872
  "pi-coding-agent",
11654
11873
  "claude",
@@ -11794,13 +12013,15 @@ ${context2.fileChanges}`;
11794
12013
  evaluatorRawRequest,
11795
12014
  tokenUsage
11796
12015
  };
11797
- } catch {
12016
+ } catch (e) {
12017
+ const message = e instanceof Error ? e.message : String(e);
11798
12018
  return {
11799
12019
  score: 0,
11800
- verdict: "fail",
12020
+ verdict: "skip",
11801
12021
  hits: [],
11802
- misses: [],
12022
+ misses: [`Judge parse failure after 3 attempts: ${message}`],
11803
12023
  expectedAspectCount: 1,
12024
+ reasoning: `Judge parse failure after 3 attempts: ${message}`,
11804
12025
  evaluatorRawRequest
11805
12026
  };
11806
12027
  }
@@ -12742,115 +12963,115 @@ var FieldAccuracyEvaluator = class {
12742
12963
  * Evaluate a single field against the expected value.
12743
12964
  */
12744
12965
  evaluateField(fieldConfig, candidateData, expectedData) {
12745
- const { path: path41, match, required = true, weight = 1 } = fieldConfig;
12746
- const candidateValue = resolvePath(candidateData, path41);
12747
- const expectedValue = resolvePath(expectedData, path41);
12966
+ const { path: path42, match, required = true, weight = 1 } = fieldConfig;
12967
+ const candidateValue = resolvePath(candidateData, path42);
12968
+ const expectedValue = resolvePath(expectedData, path42);
12748
12969
  if (expectedValue === void 0) {
12749
12970
  return {
12750
- path: path41,
12971
+ path: path42,
12751
12972
  score: 1,
12752
12973
  // No expected value means no comparison needed
12753
12974
  weight,
12754
12975
  hit: true,
12755
- message: `${path41}: no expected value`
12976
+ message: `${path42}: no expected value`
12756
12977
  };
12757
12978
  }
12758
12979
  if (candidateValue === void 0) {
12759
12980
  if (required) {
12760
12981
  return {
12761
- path: path41,
12982
+ path: path42,
12762
12983
  score: 0,
12763
12984
  weight,
12764
12985
  hit: false,
12765
- message: `${path41} (required, missing)`
12986
+ message: `${path42} (required, missing)`
12766
12987
  };
12767
12988
  }
12768
12989
  return {
12769
- path: path41,
12990
+ path: path42,
12770
12991
  score: 1,
12771
12992
  // Don't penalize missing optional fields
12772
12993
  weight: 0,
12773
12994
  // Zero weight means it won't affect the score
12774
12995
  hit: true,
12775
- message: `${path41}: optional field missing`
12996
+ message: `${path42}: optional field missing`
12776
12997
  };
12777
12998
  }
12778
12999
  switch (match) {
12779
13000
  case "exact":
12780
- return this.compareExact(path41, candidateValue, expectedValue, weight);
13001
+ return this.compareExact(path42, candidateValue, expectedValue, weight);
12781
13002
  case "numeric_tolerance":
12782
13003
  return this.compareNumericTolerance(
12783
- path41,
13004
+ path42,
12784
13005
  candidateValue,
12785
13006
  expectedValue,
12786
13007
  fieldConfig,
12787
13008
  weight
12788
13009
  );
12789
13010
  case "date":
12790
- return this.compareDate(path41, candidateValue, expectedValue, fieldConfig, weight);
13011
+ return this.compareDate(path42, candidateValue, expectedValue, fieldConfig, weight);
12791
13012
  default:
12792
13013
  return {
12793
- path: path41,
13014
+ path: path42,
12794
13015
  score: 0,
12795
13016
  weight,
12796
13017
  hit: false,
12797
- message: `${path41}: unknown match type "${match}"`
13018
+ message: `${path42}: unknown match type "${match}"`
12798
13019
  };
12799
13020
  }
12800
13021
  }
12801
13022
  /**
12802
13023
  * Exact equality comparison.
12803
13024
  */
12804
- compareExact(path41, candidateValue, expectedValue, weight) {
13025
+ compareExact(path42, candidateValue, expectedValue, weight) {
12805
13026
  if (deepEqual(candidateValue, expectedValue)) {
12806
13027
  return {
12807
- path: path41,
13028
+ path: path42,
12808
13029
  score: 1,
12809
13030
  weight,
12810
13031
  hit: true,
12811
- message: path41
13032
+ message: path42
12812
13033
  };
12813
13034
  }
12814
13035
  if (typeof candidateValue !== typeof expectedValue) {
12815
13036
  return {
12816
- path: path41,
13037
+ path: path42,
12817
13038
  score: 0,
12818
13039
  weight,
12819
13040
  hit: false,
12820
- message: `${path41} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
13041
+ message: `${path42} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12821
13042
  };
12822
13043
  }
12823
13044
  return {
12824
- path: path41,
13045
+ path: path42,
12825
13046
  score: 0,
12826
13047
  weight,
12827
13048
  hit: false,
12828
- message: `${path41} (value mismatch)`
13049
+ message: `${path42} (value mismatch)`
12829
13050
  };
12830
13051
  }
12831
13052
  /**
12832
13053
  * Numeric comparison with absolute or relative tolerance.
12833
13054
  */
12834
- compareNumericTolerance(path41, candidateValue, expectedValue, fieldConfig, weight) {
13055
+ compareNumericTolerance(path42, candidateValue, expectedValue, fieldConfig, weight) {
12835
13056
  const { tolerance = 0, relative = false } = fieldConfig;
12836
13057
  const candidateNum = toNumber2(candidateValue);
12837
13058
  const expectedNum = toNumber2(expectedValue);
12838
13059
  if (candidateNum === null || expectedNum === null) {
12839
13060
  return {
12840
- path: path41,
13061
+ path: path42,
12841
13062
  score: 0,
12842
13063
  weight,
12843
13064
  hit: false,
12844
- message: `${path41} (non-numeric value)`
13065
+ message: `${path42} (non-numeric value)`
12845
13066
  };
12846
13067
  }
12847
13068
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
12848
13069
  return {
12849
- path: path41,
13070
+ path: path42,
12850
13071
  score: 0,
12851
13072
  weight,
12852
13073
  hit: false,
12853
- message: `${path41} (invalid numeric value)`
13074
+ message: `${path42} (invalid numeric value)`
12854
13075
  };
12855
13076
  }
12856
13077
  const diff = Math.abs(candidateNum - expectedNum);
@@ -12863,61 +13084,61 @@ var FieldAccuracyEvaluator = class {
12863
13084
  }
12864
13085
  if (withinTolerance) {
12865
13086
  return {
12866
- path: path41,
13087
+ path: path42,
12867
13088
  score: 1,
12868
13089
  weight,
12869
13090
  hit: true,
12870
- message: `${path41} (within tolerance: diff=${diff.toFixed(2)})`
13091
+ message: `${path42} (within tolerance: diff=${diff.toFixed(2)})`
12871
13092
  };
12872
13093
  }
12873
13094
  return {
12874
- path: path41,
13095
+ path: path42,
12875
13096
  score: 0,
12876
13097
  weight,
12877
13098
  hit: false,
12878
- message: `${path41} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13099
+ message: `${path42} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12879
13100
  };
12880
13101
  }
12881
13102
  /**
12882
13103
  * Date comparison with format normalization.
12883
13104
  */
12884
- compareDate(path41, candidateValue, expectedValue, fieldConfig, weight) {
13105
+ compareDate(path42, candidateValue, expectedValue, fieldConfig, weight) {
12885
13106
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
12886
13107
  const candidateDate = parseDate(String(candidateValue), formats);
12887
13108
  const expectedDate = parseDate(String(expectedValue), formats);
12888
13109
  if (candidateDate === null) {
12889
13110
  return {
12890
- path: path41,
13111
+ path: path42,
12891
13112
  score: 0,
12892
13113
  weight,
12893
13114
  hit: false,
12894
- message: `${path41} (unparseable candidate date)`
13115
+ message: `${path42} (unparseable candidate date)`
12895
13116
  };
12896
13117
  }
12897
13118
  if (expectedDate === null) {
12898
13119
  return {
12899
- path: path41,
13120
+ path: path42,
12900
13121
  score: 0,
12901
13122
  weight,
12902
13123
  hit: false,
12903
- message: `${path41} (unparseable expected date)`
13124
+ message: `${path42} (unparseable expected date)`
12904
13125
  };
12905
13126
  }
12906
13127
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
12907
13128
  return {
12908
- path: path41,
13129
+ path: path42,
12909
13130
  score: 1,
12910
13131
  weight,
12911
13132
  hit: true,
12912
- message: path41
13133
+ message: path42
12913
13134
  };
12914
13135
  }
12915
13136
  return {
12916
- path: path41,
13137
+ path: path42,
12917
13138
  score: 0,
12918
13139
  weight,
12919
13140
  hit: false,
12920
- message: `${path41} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13141
+ message: `${path42} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12921
13142
  };
12922
13143
  }
12923
13144
  /**
@@ -12958,11 +13179,11 @@ var FieldAccuracyEvaluator = class {
12958
13179
  };
12959
13180
  }
12960
13181
  };
12961
- function resolvePath(obj, path41) {
12962
- if (!path41 || !obj) {
13182
+ function resolvePath(obj, path42) {
13183
+ if (!path42 || !obj) {
12963
13184
  return void 0;
12964
13185
  }
12965
- const parts = path41.split(/\.|\[|\]/).filter((p) => p.length > 0);
13186
+ const parts = path42.split(/\.|\[|\]/).filter((p) => p.length > 0);
12966
13187
  let current = obj;
12967
13188
  for (const part of parts) {
12968
13189
  if (current === null || current === void 0) {
@@ -13780,8 +14001,8 @@ var TokenUsageEvaluator = class {
13780
14001
  };
13781
14002
 
13782
14003
  // src/evaluation/evaluators/tool-trajectory.ts
13783
- function getNestedValue(obj, path41) {
13784
- const parts = path41.split(".");
14004
+ function getNestedValue(obj, path42) {
14005
+ const parts = path42.split(".");
13785
14006
  let current = obj;
13786
14007
  for (const part of parts) {
13787
14008
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -14245,13 +14466,78 @@ function runContainsAssertion(output, value) {
14245
14466
  misses: passed ? [] : [`Output does not contain "${value}"`]
14246
14467
  };
14247
14468
  }
14248
- function runRegexAssertion(output, pattern) {
14249
- const regex = new RegExp(pattern);
14469
+ function runContainsAnyAssertion(output, values) {
14470
+ const matched = values.filter((v) => output.includes(v));
14471
+ const passed = matched.length > 0;
14472
+ return {
14473
+ score: passed ? 1 : 0,
14474
+ hits: passed ? [`Output contains "${matched[0]}"`] : [],
14475
+ misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
14476
+ };
14477
+ }
14478
+ function runContainsAllAssertion(output, values) {
14479
+ const missing = values.filter((v) => !output.includes(v));
14480
+ const passed = missing.length === 0;
14481
+ return {
14482
+ score: passed ? 1 : 0,
14483
+ hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
14484
+ misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
14485
+ };
14486
+ }
14487
+ function runIcontainsAssertion(output, value) {
14488
+ const passed = output.toLowerCase().includes(value.toLowerCase());
14489
+ return {
14490
+ score: passed ? 1 : 0,
14491
+ hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
14492
+ misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
14493
+ };
14494
+ }
14495
+ function runIcontainsAnyAssertion(output, values) {
14496
+ const lower = output.toLowerCase();
14497
+ const matched = values.filter((v) => lower.includes(v.toLowerCase()));
14498
+ const passed = matched.length > 0;
14499
+ return {
14500
+ score: passed ? 1 : 0,
14501
+ hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
14502
+ misses: passed ? [] : [
14503
+ `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
14504
+ ]
14505
+ };
14506
+ }
14507
+ function runIcontainsAllAssertion(output, values) {
14508
+ const lower = output.toLowerCase();
14509
+ const missing = values.filter((v) => !lower.includes(v.toLowerCase()));
14510
+ const passed = missing.length === 0;
14511
+ return {
14512
+ score: passed ? 1 : 0,
14513
+ hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
14514
+ misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
14515
+ };
14516
+ }
14517
+ function runStartsWithAssertion(output, value) {
14518
+ const passed = output.trim().startsWith(value.trim());
14519
+ return {
14520
+ score: passed ? 1 : 0,
14521
+ hits: passed ? [`Output starts with "${value}"`] : [],
14522
+ misses: passed ? [] : [`Output does not start with "${value}"`]
14523
+ };
14524
+ }
14525
+ function runEndsWithAssertion(output, value) {
14526
+ const passed = output.trim().endsWith(value.trim());
14527
+ return {
14528
+ score: passed ? 1 : 0,
14529
+ hits: passed ? [`Output ends with "${value}"`] : [],
14530
+ misses: passed ? [] : [`Output does not end with "${value}"`]
14531
+ };
14532
+ }
14533
+ function runRegexAssertion(output, pattern, flags) {
14534
+ const regex = new RegExp(pattern, flags);
14250
14535
  const passed = regex.test(output);
14536
+ const flagsLabel = flags ? ` (flags: ${flags})` : "";
14251
14537
  return {
14252
14538
  score: passed ? 1 : 0,
14253
- hits: passed ? [`Output matches pattern /${pattern}/`] : [],
14254
- misses: passed ? [] : [`Output does not match pattern /${pattern}/`]
14539
+ hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
14540
+ misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
14255
14541
  };
14256
14542
  }
14257
14543
  function runIsJsonAssertion(output) {
@@ -14277,9 +14563,9 @@ function runEqualsAssertion(output, value) {
14277
14563
  }
14278
14564
 
14279
14565
  // src/evaluation/orchestrator.ts
14280
- var import_node_crypto8 = require("crypto");
14281
- var import_promises28 = require("fs/promises");
14282
- var import_node_path39 = __toESM(require("path"), 1);
14566
+ var import_node_crypto9 = require("crypto");
14567
+ var import_promises29 = require("fs/promises");
14568
+ var import_node_path40 = __toESM(require("path"), 1);
14283
14569
  var import_micromatch4 = __toESM(require("micromatch"), 1);
14284
14570
 
14285
14571
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -14669,13 +14955,13 @@ var containsFactory = (config) => {
14669
14955
  var regexFactory = (config) => {
14670
14956
  const c = config;
14671
14957
  return new DeterministicAssertionEvaluator("regex", (ctx) => {
14672
- const result = runRegexAssertion(ctx.candidate, c.value);
14958
+ const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
14673
14959
  return {
14674
14960
  score: result.score,
14675
14961
  verdict: result.score === 1 ? "pass" : "fail",
14676
14962
  hits: result.hits,
14677
14963
  misses: result.misses,
14678
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/` : `Output does not match pattern /${c.value}/`,
14964
+ reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
14679
14965
  expectedAspectCount: 1
14680
14966
  };
14681
14967
  });
@@ -14707,9 +14993,107 @@ var equalsFactory = (config) => {
14707
14993
  };
14708
14994
  });
14709
14995
  };
14996
+ var containsAnyFactory = (config) => {
14997
+ const c = config;
14998
+ return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
14999
+ const result = runContainsAnyAssertion(ctx.candidate, c.value);
15000
+ return {
15001
+ score: result.score,
15002
+ verdict: result.score === 1 ? "pass" : "fail",
15003
+ hits: result.hits,
15004
+ misses: result.misses,
15005
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15006
+ expectedAspectCount: 1
15007
+ };
15008
+ });
15009
+ };
15010
+ var containsAllFactory = (config) => {
15011
+ const c = config;
15012
+ return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
15013
+ const result = runContainsAllAssertion(ctx.candidate, c.value);
15014
+ return {
15015
+ score: result.score,
15016
+ verdict: result.score === 1 ? "pass" : "fail",
15017
+ hits: result.hits,
15018
+ misses: result.misses,
15019
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15020
+ expectedAspectCount: 1
15021
+ };
15022
+ });
15023
+ };
15024
+ var icontainsFactory = (config) => {
15025
+ const c = config;
15026
+ return new DeterministicAssertionEvaluator("icontains", (ctx) => {
15027
+ const result = runIcontainsAssertion(ctx.candidate, c.value);
15028
+ return {
15029
+ score: result.score,
15030
+ verdict: result.score === 1 ? "pass" : "fail",
15031
+ hits: result.hits,
15032
+ misses: result.misses,
15033
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15034
+ expectedAspectCount: 1
15035
+ };
15036
+ });
15037
+ };
15038
+ var icontainsAnyFactory = (config) => {
15039
+ const c = config;
15040
+ return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
15041
+ const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
15042
+ return {
15043
+ score: result.score,
15044
+ verdict: result.score === 1 ? "pass" : "fail",
15045
+ hits: result.hits,
15046
+ misses: result.misses,
15047
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15048
+ expectedAspectCount: 1
15049
+ };
15050
+ });
15051
+ };
15052
+ var icontainsAllFactory = (config) => {
15053
+ const c = config;
15054
+ return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
15055
+ const result = runIcontainsAllAssertion(ctx.candidate, c.value);
15056
+ return {
15057
+ score: result.score,
15058
+ verdict: result.score === 1 ? "pass" : "fail",
15059
+ hits: result.hits,
15060
+ misses: result.misses,
15061
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15062
+ expectedAspectCount: 1
15063
+ };
15064
+ });
15065
+ };
15066
+ var startsWithFactory = (config) => {
15067
+ const c = config;
15068
+ return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
15069
+ const result = runStartsWithAssertion(ctx.candidate, c.value);
15070
+ return {
15071
+ score: result.score,
15072
+ verdict: result.score === 1 ? "pass" : "fail",
15073
+ hits: result.hits,
15074
+ misses: result.misses,
15075
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15076
+ expectedAspectCount: 1
15077
+ };
15078
+ });
15079
+ };
15080
+ var endsWithFactory = (config) => {
15081
+ const c = config;
15082
+ return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
15083
+ const result = runEndsWithAssertion(ctx.candidate, c.value);
15084
+ return {
15085
+ score: result.score,
15086
+ verdict: result.score === 1 ? "pass" : "fail",
15087
+ hits: result.hits,
15088
+ misses: result.misses,
15089
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
15090
+ expectedAspectCount: 1
15091
+ };
15092
+ });
15093
+ };
14710
15094
  function createBuiltinRegistry() {
14711
15095
  const registry = new EvaluatorRegistry();
14712
- registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
15096
+ registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
14713
15097
  return registry;
14714
15098
  }
14715
15099
 
@@ -15053,37 +15437,255 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
15053
15437
  }
15054
15438
  }
15055
15439
 
15056
- // src/evaluation/workspace/resolve.ts
15440
+ // src/evaluation/workspace/repo-manager.ts
15441
+ var import_node_child_process7 = require("child_process");
15442
+ var import_node_crypto8 = require("crypto");
15443
+ var import_node_fs11 = require("fs");
15057
15444
  var import_promises27 = require("fs/promises");
15445
+ var import_node_os7 = __toESM(require("os"), 1);
15058
15446
  var import_node_path38 = __toESM(require("path"), 1);
15447
+ var import_node_util5 = require("util");
15448
+ var execFileAsync = (0, import_node_util5.promisify)(import_node_child_process7.execFile);
15449
+ var DEFAULT_CACHE_DIR = import_node_path38.default.join(import_node_os7.default.homedir(), ".agentv", "git-cache");
15450
+ var DEFAULT_TIMEOUT_MS2 = 3e5;
15451
+ var LOCK_TIMEOUT_MS = 6e4;
15452
+ function gitEnv() {
15453
+ const env = { ...process.env };
15454
+ for (const key of Object.keys(env)) {
15455
+ if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
15456
+ delete env[key];
15457
+ }
15458
+ }
15459
+ return {
15460
+ ...env,
15461
+ GIT_TERMINAL_PROMPT: "0",
15462
+ GIT_ASKPASS: "",
15463
+ GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
15464
+ };
15465
+ }
15466
+ function cacheKey(source) {
15467
+ const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
15468
+ return (0, import_node_crypto8.createHash)("sha256").update(raw).digest("hex");
15469
+ }
15470
+ function getSourceUrl(source) {
15471
+ return source.type === "git" ? source.url : source.path;
15472
+ }
15473
+ async function git(args, opts) {
15474
+ const { stdout } = await execFileAsync("git", args, {
15475
+ cwd: opts?.cwd,
15476
+ timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
15477
+ env: gitEnv(),
15478
+ maxBuffer: 50 * 1024 * 1024
15479
+ // 50MB
15480
+ });
15481
+ return stdout.trim();
15482
+ }
15483
+ async function acquireLock(lockPath) {
15484
+ const start = Date.now();
15485
+ while (Date.now() - start < LOCK_TIMEOUT_MS) {
15486
+ try {
15487
+ await (0, import_promises27.writeFile)(lockPath, String(process.pid), { flag: "wx" });
15488
+ return;
15489
+ } catch (err) {
15490
+ if (err.code === "EEXIST") {
15491
+ await new Promise((r) => setTimeout(r, 200));
15492
+ continue;
15493
+ }
15494
+ throw err;
15495
+ }
15496
+ }
15497
+ throw new Error(`Timed out waiting for lock: ${lockPath}`);
15498
+ }
15499
+ async function releaseLock(lockPath) {
15500
+ try {
15501
+ await (0, import_promises27.unlink)(lockPath);
15502
+ } catch {
15503
+ }
15504
+ }
15505
+ var RepoManager = class {
15506
+ cacheDir;
15507
+ constructor(cacheDir) {
15508
+ this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
15509
+ }
15510
+ /**
15511
+ * Ensure a bare mirror cache exists for the given source.
15512
+ * Creates on first access, fetches updates on subsequent calls.
15513
+ * Returns the absolute path to the cache directory.
15514
+ */
15515
+ async ensureCache(source, depth) {
15516
+ const key = cacheKey(source);
15517
+ const cachePath = import_node_path38.default.join(this.cacheDir, key);
15518
+ const lockPath = `${cachePath}.lock`;
15519
+ await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
15520
+ await acquireLock(lockPath);
15521
+ try {
15522
+ if ((0, import_node_fs11.existsSync)(import_node_path38.default.join(cachePath, "HEAD"))) {
15523
+ const fetchArgs = ["fetch", "--prune"];
15524
+ if (depth) {
15525
+ fetchArgs.push("--depth", String(depth));
15526
+ }
15527
+ await git(fetchArgs, { cwd: cachePath });
15528
+ } else {
15529
+ const cloneArgs = ["clone", "--mirror", "--bare"];
15530
+ if (depth) {
15531
+ cloneArgs.push("--depth", String(depth));
15532
+ }
15533
+ const sourceUrl = getSourceUrl(source);
15534
+ const cloneUrl = depth && source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
15535
+ cloneArgs.push(cloneUrl, cachePath);
15536
+ await git(cloneArgs);
15537
+ }
15538
+ } finally {
15539
+ await releaseLock(lockPath);
15540
+ }
15541
+ return cachePath;
15542
+ }
15543
+ /**
15544
+ * Clone a repo from cache into the workspace at the configured path.
15545
+ * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
15546
+ */
15547
+ async materialize(repo, workspacePath) {
15548
+ const targetDir = import_node_path38.default.join(workspacePath, repo.path);
15549
+ const cachePath = await this.ensureCache(repo.source, repo.clone?.depth);
15550
+ const cloneArgs = ["clone"];
15551
+ if (repo.clone?.depth) {
15552
+ cloneArgs.push("--depth", String(repo.clone.depth));
15553
+ }
15554
+ if (repo.clone?.filter) {
15555
+ cloneArgs.push("--filter", repo.clone.filter);
15556
+ }
15557
+ cloneArgs.push("--no-checkout");
15558
+ const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
15559
+ cloneArgs.push(cloneUrl, targetDir);
15560
+ await git(cloneArgs);
15561
+ if (repo.clone?.sparse?.length) {
15562
+ await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
15563
+ await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
15564
+ }
15565
+ const ref = repo.checkout?.ref ?? "HEAD";
15566
+ const resolve = repo.checkout?.resolve ?? "remote";
15567
+ let resolvedSha;
15568
+ if (resolve === "remote" && repo.source.type === "git") {
15569
+ const url = getSourceUrl(repo.source);
15570
+ try {
15571
+ const lsOutput = await git(["ls-remote", url, ref]);
15572
+ const match = lsOutput.split(" ")[0];
15573
+ if (!match) {
15574
+ throw new Error(`Ref '${ref}' not found on remote ${url}`);
15575
+ }
15576
+ resolvedSha = match;
15577
+ } catch (err) {
15578
+ if (err instanceof Error && err.message.includes("not found")) throw err;
15579
+ resolvedSha = ref;
15580
+ }
15581
+ } else {
15582
+ resolvedSha = ref;
15583
+ }
15584
+ await git(["checkout", resolvedSha], { cwd: targetDir });
15585
+ const ancestor = repo.checkout?.ancestor ?? 0;
15586
+ if (ancestor > 0) {
15587
+ try {
15588
+ const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
15589
+ await git(["checkout", ancestorSha], { cwd: targetDir });
15590
+ } catch {
15591
+ if (repo.clone?.depth) {
15592
+ await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
15593
+ const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
15594
+ await git(["checkout", ancestorSha], { cwd: targetDir });
15595
+ } else {
15596
+ throw new Error(
15597
+ `Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`
15598
+ );
15599
+ }
15600
+ }
15601
+ }
15602
+ }
15603
+ /** Materialize all repos into the workspace. */
15604
+ async materializeAll(repos, workspacePath) {
15605
+ for (const repo of repos) {
15606
+ await this.materialize(repo, workspacePath);
15607
+ }
15608
+ }
15609
+ /** Reset repos in workspace to their checkout state. */
15610
+ async reset(repos, workspacePath, strategy) {
15611
+ if (strategy === "recreate") {
15612
+ for (const repo of repos) {
15613
+ const targetDir = import_node_path38.default.join(workspacePath, repo.path);
15614
+ await (0, import_promises27.rm)(targetDir, { recursive: true, force: true });
15615
+ }
15616
+ await this.materializeAll(repos, workspacePath);
15617
+ return;
15618
+ }
15619
+ for (const repo of repos) {
15620
+ const targetDir = import_node_path38.default.join(workspacePath, repo.path);
15621
+ await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
15622
+ await git(["clean", "-fd"], { cwd: targetDir });
15623
+ }
15624
+ }
15625
+ /**
15626
+ * Seed the cache from a local repository, setting the remote to a given URL.
15627
+ * Useful for avoiding slow network clones when a local clone already exists.
15628
+ */
15629
+ async seedCache(localPath, remoteUrl, opts) {
15630
+ const source = { type: "git", url: remoteUrl };
15631
+ const key = cacheKey(source);
15632
+ const cachePath = import_node_path38.default.join(this.cacheDir, key);
15633
+ const lockPath = `${cachePath}.lock`;
15634
+ await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
15635
+ await acquireLock(lockPath);
15636
+ try {
15637
+ if ((0, import_node_fs11.existsSync)(import_node_path38.default.join(cachePath, "HEAD"))) {
15638
+ if (!opts?.force) {
15639
+ throw new Error(
15640
+ `Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
15641
+ );
15642
+ }
15643
+ await (0, import_promises27.rm)(cachePath, { recursive: true, force: true });
15644
+ }
15645
+ await git(["clone", "--mirror", "--bare", localPath, cachePath]);
15646
+ await git(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
15647
+ } finally {
15648
+ await releaseLock(lockPath);
15649
+ }
15650
+ return cachePath;
15651
+ }
15652
+ /** Remove the entire cache directory. */
15653
+ async cleanCache() {
15654
+ await (0, import_promises27.rm)(this.cacheDir, { recursive: true, force: true });
15655
+ }
15656
+ };
15657
+
15658
+ // src/evaluation/workspace/resolve.ts
15659
+ var import_promises28 = require("fs/promises");
15660
+ var import_node_path39 = __toESM(require("path"), 1);
15059
15661
  async function resolveWorkspaceTemplate(templatePath) {
15060
15662
  if (!templatePath) {
15061
15663
  return void 0;
15062
15664
  }
15063
- const resolved = import_node_path38.default.resolve(templatePath);
15064
- const stats = await (0, import_promises27.stat)(resolved);
15665
+ const resolved = import_node_path39.default.resolve(templatePath);
15666
+ const stats = await (0, import_promises28.stat)(resolved);
15065
15667
  if (stats.isFile()) {
15066
15668
  return {
15067
- dir: import_node_path38.default.dirname(resolved),
15669
+ dir: import_node_path39.default.dirname(resolved),
15068
15670
  workspaceFile: resolved
15069
15671
  };
15070
15672
  }
15071
15673
  if (!stats.isDirectory()) {
15072
15674
  throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
15073
15675
  }
15074
- const entries = await (0, import_promises27.readdir)(resolved);
15676
+ const entries = await (0, import_promises28.readdir)(resolved);
15075
15677
  const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
15076
15678
  if (workspaceFiles.length === 1) {
15077
15679
  return {
15078
15680
  dir: resolved,
15079
- workspaceFile: import_node_path38.default.join(resolved, workspaceFiles[0])
15681
+ workspaceFile: import_node_path39.default.join(resolved, workspaceFiles[0])
15080
15682
  };
15081
15683
  }
15082
15684
  if (workspaceFiles.length > 1) {
15083
15685
  const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
15084
15686
  return {
15085
15687
  dir: resolved,
15086
- workspaceFile: conventionFile ? import_node_path38.default.join(resolved, conventionFile) : void 0
15688
+ workspaceFile: conventionFile ? import_node_path39.default.join(resolved, conventionFile) : void 0
15087
15689
  };
15088
15690
  }
15089
15691
  return { dir: resolved };
@@ -15158,7 +15760,7 @@ async function runEvaluation(options) {
15158
15760
  );
15159
15761
  useCache = false;
15160
15762
  }
15161
- const evalRunId = (0, import_node_crypto8.randomUUID)();
15763
+ const evalRunId = (0, import_node_crypto9.randomUUID)();
15162
15764
  const evalCases = preloadedEvalCases ?? await loadTests(evalFilePath, repoRoot, { verbose, filter });
15163
15765
  const filteredEvalCases = filterEvalCases(evalCases, filter);
15164
15766
  if (filteredEvalCases.length === 0) {
@@ -15205,6 +15807,11 @@ async function runEvaluation(options) {
15205
15807
  }
15206
15808
  return getOrCreateProvider(resolvedJudge);
15207
15809
  };
15810
+ if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
15811
+ throw new Error(
15812
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
15813
+ );
15814
+ }
15208
15815
  const targetResolver = (name) => {
15209
15816
  const resolved = resolveTargetByName(name);
15210
15817
  if (!resolved) {
@@ -15218,7 +15825,7 @@ async function runEvaluation(options) {
15218
15825
  ];
15219
15826
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
15220
15827
  const typeRegistry = createBuiltinRegistry();
15221
- const discoveryBaseDir = evalFilePath ? import_node_path39.default.dirname(import_node_path39.default.resolve(evalFilePath)) : process.cwd();
15828
+ const discoveryBaseDir = evalFilePath ? import_node_path40.default.dirname(import_node_path40.default.resolve(evalFilePath)) : process.cwd();
15222
15829
  await discoverAssertions(typeRegistry, discoveryBaseDir);
15223
15830
  const providerRegistry = createBuiltinProviderRegistry();
15224
15831
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -15273,7 +15880,8 @@ async function runEvaluation(options) {
15273
15880
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
15274
15881
  const workspaceTemplate = resolvedTemplate?.dir;
15275
15882
  const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
15276
- const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all);
15883
+ const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
15884
+ const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
15277
15885
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
15278
15886
  const workers = hasSharedWorkspace ? 1 : requestedWorkers;
15279
15887
  if (hasSharedWorkspace && requestedWorkers > 1) {
@@ -15292,9 +15900,22 @@ async function runEvaluation(options) {
15292
15900
  const message = error instanceof Error ? error.message : String(error);
15293
15901
  throw new Error(`Failed to create shared workspace: ${message}`);
15294
15902
  }
15295
- } else if (suiteWorkspace?.before_all) {
15903
+ } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
15296
15904
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
15297
- await (0, import_promises28.mkdir)(sharedWorkspacePath, { recursive: true });
15905
+ await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
15906
+ }
15907
+ const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
15908
+ if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
15909
+ try {
15910
+ await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
15911
+ } catch (error) {
15912
+ const message = error instanceof Error ? error.message : String(error);
15913
+ if (sharedWorkspacePath) {
15914
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
15915
+ });
15916
+ }
15917
+ throw new Error(`Failed to materialize repos: ${message}`);
15918
+ }
15298
15919
  }
15299
15920
  if (sharedWorkspacePath && suiteWorkspace?.before_all) {
15300
15921
  const scriptContext = {
@@ -15385,7 +16006,8 @@ async function runEvaluation(options) {
15385
16006
  sharedBaselineCommit,
15386
16007
  suiteWorkspaceFile,
15387
16008
  streamCallbacks,
15388
- typeRegistry
16009
+ typeRegistry,
16010
+ repoManager
15389
16011
  };
15390
16012
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
15391
16013
  if (totalBudgetUsd !== void 0) {
@@ -15660,15 +16282,16 @@ async function runEvalCase(options) {
15660
16282
  sharedWorkspacePath,
15661
16283
  sharedBaselineCommit,
15662
16284
  suiteWorkspaceFile,
15663
- typeRegistry: providedTypeRegistry
16285
+ typeRegistry: providedTypeRegistry,
16286
+ repoManager
15664
16287
  } = options;
15665
16288
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
15666
16289
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
15667
16290
  const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
15668
- const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
16291
+ const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
15669
16292
  let cachedResponse;
15670
- if (cacheKey && cache) {
15671
- cachedResponse = await cache.get(cacheKey);
16293
+ if (cacheKey2 && cache) {
16294
+ cachedResponse = await cache.get(cacheKey2);
15672
16295
  }
15673
16296
  const nowFn = now ?? (() => /* @__PURE__ */ new Date());
15674
16297
  let workspacePath = sharedWorkspacePath;
@@ -15697,9 +16320,25 @@ async function runEvalCase(options) {
15697
16320
  );
15698
16321
  }
15699
16322
  }
15700
- if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
16323
+ if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
15701
16324
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
15702
- await (0, import_promises28.mkdir)(workspacePath, { recursive: true });
16325
+ await (0, import_promises29.mkdir)(workspacePath, { recursive: true });
16326
+ }
16327
+ if (evalCase.workspace?.repos?.length && workspacePath) {
16328
+ const perCaseRepoManager = new RepoManager();
16329
+ try {
16330
+ await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
16331
+ } catch (error) {
16332
+ const message = error instanceof Error ? error.message : String(error);
16333
+ return buildErrorResult(
16334
+ evalCase,
16335
+ target.name,
16336
+ nowFn(),
16337
+ new Error(`Failed to materialize repos: ${message}`),
16338
+ promptInputs,
16339
+ provider
16340
+ );
16341
+ }
15703
16342
  }
15704
16343
  if (workspacePath && evalCase.workspace?.before_all) {
15705
16344
  const scriptContext = {
@@ -15823,8 +16462,8 @@ async function runEvalCase(options) {
15823
16462
  }
15824
16463
  return errorResult;
15825
16464
  }
15826
- if (cacheKey && cache && !cachedResponse) {
15827
- await cache.set(cacheKey, providerResponse);
16465
+ if (cacheKey2 && cache && !cachedResponse) {
16466
+ await cache.set(cacheKey2, providerResponse);
15828
16467
  }
15829
16468
  const output = providerResponse.output;
15830
16469
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
@@ -15852,6 +16491,16 @@ async function runEvalCase(options) {
15852
16491
  }
15853
16492
  }
15854
16493
  const providerError = extractProviderError(providerResponse);
16494
+ if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
16495
+ try {
16496
+ await repoManager.reset(
16497
+ evalCase.workspace.repos,
16498
+ workspacePath,
16499
+ evalCase.workspace.reset.strategy
16500
+ );
16501
+ } catch {
16502
+ }
16503
+ }
15855
16504
  if (workspacePath && evalCase.workspace?.after_each) {
15856
16505
  const scriptContext = {
15857
16506
  workspacePath,
@@ -16216,7 +16865,7 @@ async function runEvaluatorList(options) {
16216
16865
  fileChanges,
16217
16866
  workspacePath
16218
16867
  };
16219
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path39.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
16868
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path40.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
16220
16869
  const dispatchContext = {
16221
16870
  judgeProvider,
16222
16871
  targetResolver,
@@ -16306,8 +16955,9 @@ async function runEvaluatorList(options) {
16306
16955
  const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
16307
16956
  return entry.score.score < minScore;
16308
16957
  });
16309
- const aggregateScore = hasRequiredFailure ? 0 : scored.length > 0 ? computeWeightedMean(
16310
- scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
16958
+ const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
16959
+ const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
16960
+ scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
16311
16961
  ) : 0;
16312
16962
  const hits = scored.flatMap((entry) => entry.score.hits);
16313
16963
  const misses = scored.flatMap((entry) => entry.score.misses);
@@ -16447,7 +17097,7 @@ function extractProviderError(response) {
16447
17097
  return trimmed.length > 0 ? trimmed : void 0;
16448
17098
  }
16449
17099
  function createCacheKey(provider, target, evalCase, promptInputs) {
16450
- const hash = (0, import_node_crypto8.createHash)("sha256");
17100
+ const hash = (0, import_node_crypto9.createHash)("sha256");
16451
17101
  hash.update(provider.id);
16452
17102
  hash.update(target.name);
16453
17103
  hash.update(evalCase.id);
@@ -16515,8 +17165,8 @@ function computeWeightedMean(entries) {
16515
17165
  }
16516
17166
 
16517
17167
  // src/evaluation/evaluate.ts
16518
- var import_node_fs11 = require("fs");
16519
- var import_node_path40 = __toESM(require("path"), 1);
17168
+ var import_node_fs12 = require("fs");
17169
+ var import_node_path41 = __toESM(require("path"), 1);
16520
17170
  async function evaluate(config) {
16521
17171
  const startTime = Date.now();
16522
17172
  if (config.tests && config.specFile) {
@@ -16538,13 +17188,13 @@ async function evaluate(config) {
16538
17188
  let evalCases;
16539
17189
  let testFilePath;
16540
17190
  if (config.specFile) {
16541
- testFilePath = import_node_path40.default.resolve(config.specFile);
17191
+ testFilePath = import_node_path41.default.resolve(config.specFile);
16542
17192
  evalCases = await loadTests(testFilePath, repoRoot, {
16543
17193
  verbose: config.verbose,
16544
17194
  filter: config.filter
16545
17195
  });
16546
17196
  } else {
16547
- testFilePath = import_node_path40.default.join(process.cwd(), "__programmatic__.yaml");
17197
+ testFilePath = import_node_path41.default.join(process.cwd(), "__programmatic__.yaml");
16548
17198
  evalCases = (config.tests ?? []).map((test) => {
16549
17199
  const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
16550
17200
  const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
@@ -16635,11 +17285,11 @@ function computeSummary(results, durationMs) {
16635
17285
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
16636
17286
  async function discoverDefaultTarget(repoRoot) {
16637
17287
  const cwd = process.cwd();
16638
- const chain = buildDirectoryChain2(import_node_path40.default.join(cwd, "_placeholder"), repoRoot);
17288
+ const chain = buildDirectoryChain2(import_node_path41.default.join(cwd, "_placeholder"), repoRoot);
16639
17289
  for (const dir of chain) {
16640
17290
  for (const candidate of TARGET_FILE_CANDIDATES) {
16641
- const targetsPath = import_node_path40.default.join(dir, candidate);
16642
- if (!(0, import_node_fs11.existsSync)(targetsPath)) continue;
17291
+ const targetsPath = import_node_path41.default.join(dir, candidate);
17292
+ if (!(0, import_node_fs12.existsSync)(targetsPath)) continue;
16643
17293
  try {
16644
17294
  const definitions = await readTargetDefinitions(targetsPath);
16645
17295
  const defaultTarget = definitions.find((d) => d.name === "default");
@@ -16653,11 +17303,11 @@ async function discoverDefaultTarget(repoRoot) {
16653
17303
  async function loadEnvHierarchy(repoRoot) {
16654
17304
  const { readFileSync: readFileSync2 } = await import("fs");
16655
17305
  const cwd = process.cwd();
16656
- const chain = buildDirectoryChain2(import_node_path40.default.join(cwd, "_placeholder"), repoRoot);
17306
+ const chain = buildDirectoryChain2(import_node_path41.default.join(cwd, "_placeholder"), repoRoot);
16657
17307
  const envFiles = [];
16658
17308
  for (const dir of chain) {
16659
- const envPath = import_node_path40.default.join(dir, ".env");
16660
- if ((0, import_node_fs11.existsSync)(envPath)) envFiles.push(envPath);
17309
+ const envPath = import_node_path41.default.join(dir, ".env");
17310
+ if ((0, import_node_fs12.existsSync)(envPath)) envFiles.push(envPath);
16661
17311
  }
16662
17312
  for (let i = envFiles.length - 1; i >= 0; i--) {
16663
17313
  try {
@@ -16727,12 +17377,12 @@ var CONFIG_FILE_NAMES = [
16727
17377
  ".agentv/config.js"
16728
17378
  ];
16729
17379
  async function loadTsConfig(projectRoot) {
16730
- const { existsSync: existsSync3 } = await import("fs");
17380
+ const { existsSync: existsSync4 } = await import("fs");
16731
17381
  const { pathToFileURL } = await import("url");
16732
17382
  const { join: join2 } = await import("path");
16733
17383
  for (const fileName of CONFIG_FILE_NAMES) {
16734
17384
  const filePath = join2(projectRoot, fileName);
16735
- if (!existsSync3(filePath)) {
17385
+ if (!existsSync4(filePath)) {
16736
17386
  continue;
16737
17387
  }
16738
17388
  try {
@@ -16829,8 +17479,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
16829
17479
  }
16830
17480
 
16831
17481
  // src/evaluation/cache/response-cache.ts
16832
- var import_promises29 = require("fs/promises");
16833
- var import_node_path41 = __toESM(require("path"), 1);
17482
+ var import_promises30 = require("fs/promises");
17483
+ var import_node_path42 = __toESM(require("path"), 1);
16834
17484
  var DEFAULT_CACHE_PATH = ".agentv/cache";
16835
17485
  var ResponseCache = class {
16836
17486
  cachePath;
@@ -16840,7 +17490,7 @@ var ResponseCache = class {
16840
17490
  async get(key) {
16841
17491
  const filePath = this.keyToPath(key);
16842
17492
  try {
16843
- const data = await (0, import_promises29.readFile)(filePath, "utf8");
17493
+ const data = await (0, import_promises30.readFile)(filePath, "utf8");
16844
17494
  return JSON.parse(data);
16845
17495
  } catch {
16846
17496
  return void 0;
@@ -16848,13 +17498,13 @@ var ResponseCache = class {
16848
17498
  }
16849
17499
  async set(key, value) {
16850
17500
  const filePath = this.keyToPath(key);
16851
- const dir = import_node_path41.default.dirname(filePath);
16852
- await (0, import_promises29.mkdir)(dir, { recursive: true });
16853
- await (0, import_promises29.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
17501
+ const dir = import_node_path42.default.dirname(filePath);
17502
+ await (0, import_promises30.mkdir)(dir, { recursive: true });
17503
+ await (0, import_promises30.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
16854
17504
  }
16855
17505
  keyToPath(key) {
16856
17506
  const prefix = key.slice(0, 2);
16857
- return import_node_path41.default.join(this.cachePath, prefix, `${key}.json`);
17507
+ return import_node_path42.default.join(this.cachePath, prefix, `${key}.json`);
16858
17508
  }
16859
17509
  };
16860
17510
  function shouldEnableCache(params) {
@@ -17332,6 +17982,7 @@ function createAgentKernel() {
17332
17982
  OtelTraceExporter,
17333
17983
  OtlpJsonFileExporter,
17334
17984
  ProviderRegistry,
17985
+ RepoManager,
17335
17986
  ResponseCache,
17336
17987
  SimpleTraceFileExporter,
17337
17988
  TEST_MESSAGE_ROLES,
@@ -17417,12 +18068,19 @@ function createAgentKernel() {
17417
18068
  resolveTargetDefinition,
17418
18069
  resolveWorkspaceTemplate,
17419
18070
  rubricEvaluationSchema,
18071
+ runContainsAllAssertion,
18072
+ runContainsAnyAssertion,
17420
18073
  runContainsAssertion,
18074
+ runEndsWithAssertion,
17421
18075
  runEqualsAssertion,
17422
18076
  runEvalCase,
17423
18077
  runEvaluation,
18078
+ runIcontainsAllAssertion,
18079
+ runIcontainsAnyAssertion,
18080
+ runIcontainsAssertion,
17424
18081
  runIsJsonAssertion,
17425
18082
  runRegexAssertion,
18083
+ runStartsWithAssertion,
17426
18084
  scoreToVerdict,
17427
18085
  shouldEnableCache,
17428
18086
  shouldSkipCacheForTemperature,