@agentv/core 4.10.0 → 4.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -25,10 +25,17 @@ import {
25
25
  resolveDelegatedTargetDefinition,
26
26
  resolveFileReference,
27
27
  resolveTargetDefinition
28
- } from "./chunk-BWHUWLGW.js";
28
+ } from "./chunk-5POFMJJ7.js";
29
+ import {
30
+ execFileWithStdin,
31
+ execShellWithStdin
32
+ } from "./chunk-3WGHC7LC.js";
29
33
  import {
30
34
  AgentvProvider
31
35
  } from "./chunk-PRNXHNLF.js";
36
+ import {
37
+ DockerWorkspaceProvider
38
+ } from "./chunk-SDIANPEY.js";
32
39
  import {
33
40
  OtlpJsonFileExporter
34
41
  } from "./chunk-KPSI5CSL.js";
@@ -152,10 +159,10 @@ function mergeExecutionMetrics(computed, metrics) {
152
159
  }
153
160
 
154
161
  // src/evaluation/yaml-parser.ts
155
- import { readFile as readFile7 } from "node:fs/promises";
162
+ import { readFile as readFile8 } from "node:fs/promises";
156
163
  import path8 from "node:path";
157
164
  import micromatch2 from "micromatch";
158
- import { parse as parse2 } from "yaml";
165
+ import { parse as parse3 } from "yaml";
159
166
 
160
167
  // src/evaluation/input-message-utils.ts
161
168
  function flattenInputMessages(messages) {
@@ -441,10 +448,12 @@ async function loadConfig(evalFilePath, repoRoot) {
441
448
  parsed.execution,
442
449
  configPath
443
450
  );
451
+ const results = parseResultsConfig(parsed.results, configPath);
444
452
  return {
445
453
  required_version: requiredVersion,
446
454
  eval_patterns: evalPatterns,
447
- execution: executionDefaults
455
+ execution: executionDefaults,
456
+ results
448
457
  };
449
458
  } catch (error) {
450
459
  logWarning(
@@ -679,163 +688,74 @@ function parseExecutionDefaults(raw, configPath) {
679
688
  }
680
689
  return Object.keys(result).length > 0 ? result : void 0;
681
690
  }
691
+ function parseResultsConfig(raw, configPath) {
692
+ if (raw === void 0 || raw === null) {
693
+ return void 0;
694
+ }
695
+ if (typeof raw !== "object" || Array.isArray(raw)) {
696
+ logWarning(`Invalid results in ${configPath}, expected object`);
697
+ return void 0;
698
+ }
699
+ const obj = raw;
700
+ const exportConfig = parseResultsExportConfig(obj.export, configPath);
701
+ if (!exportConfig) {
702
+ return void 0;
703
+ }
704
+ return { export: exportConfig };
705
+ }
706
+ function parseResultsExportConfig(raw, configPath) {
707
+ if (raw === void 0 || raw === null) {
708
+ return void 0;
709
+ }
710
+ if (typeof raw !== "object" || Array.isArray(raw)) {
711
+ logWarning(`Invalid results.export in ${configPath}, expected object`);
712
+ return void 0;
713
+ }
714
+ const obj = raw;
715
+ const repo = typeof obj.repo === "string" ? obj.repo.trim() : "";
716
+ const exportPath = typeof obj.path === "string" ? obj.path.trim() : "";
717
+ if (!repo) {
718
+ logWarning(`Invalid results.export.repo in ${configPath}, expected non-empty string`);
719
+ return void 0;
720
+ }
721
+ if (!exportPath) {
722
+ logWarning(`Invalid results.export.path in ${configPath}, expected non-empty string`);
723
+ return void 0;
724
+ }
725
+ if (obj.auto_push !== void 0 && typeof obj.auto_push !== "boolean") {
726
+ logWarning(`Invalid results.export.auto_push in ${configPath}, expected boolean`);
727
+ return void 0;
728
+ }
729
+ let branchPrefix;
730
+ if (obj.branch_prefix !== void 0) {
731
+ if (typeof obj.branch_prefix !== "string" || obj.branch_prefix.trim().length === 0) {
732
+ logWarning(
733
+ `Invalid results.export.branch_prefix in ${configPath}, expected non-empty string`
734
+ );
735
+ return void 0;
736
+ }
737
+ branchPrefix = obj.branch_prefix.trim();
738
+ }
739
+ return {
740
+ repo,
741
+ path: exportPath,
742
+ ...typeof obj.auto_push === "boolean" && { auto_push: obj.auto_push },
743
+ ...branchPrefix && { branch_prefix: branchPrefix }
744
+ };
745
+ }
682
746
  function logWarning(message) {
683
747
  console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET2}`);
684
748
  }
685
749
 
686
750
  // src/evaluation/loaders/evaluator-parser.ts
751
+ import { readFile as readFile5 } from "node:fs/promises";
687
752
  import path5 from "node:path";
753
+ import { parse as parse2 } from "yaml";
688
754
 
689
755
  // src/evaluation/content-preprocessor.ts
690
756
  import { readFile as readFile3 } from "node:fs/promises";
691
757
  import path4 from "node:path";
692
758
  import { fileURLToPath as fileURLToPath2 } from "node:url";
693
-
694
- // src/runtime/exec.ts
695
- function shellEscapePath(value) {
696
- if (process.platform === "win32") {
697
- return `"${value.replaceAll('"', '""')}"`;
698
- }
699
- return `'${value.replaceAll("'", `'"'"'`)}'`;
700
- }
701
- async function execFileWithStdin(argv, stdinPayload, options = {}) {
702
- if (argv.length === 0) {
703
- throw new Error("Executable argv must include at least one entry");
704
- }
705
- if (typeof Bun !== "undefined") {
706
- return execFileWithStdinBun(argv, stdinPayload, options);
707
- }
708
- return execFileWithStdinNode(argv, stdinPayload, options);
709
- }
710
- async function execFileWithStdinBun(argv, stdinPayload, options) {
711
- const command = [...argv];
712
- const encoder = new TextEncoder();
713
- const proc = Bun.spawn(command, {
714
- cwd: options.cwd,
715
- stdin: encoder.encode(stdinPayload),
716
- stdout: "pipe",
717
- stderr: "pipe",
718
- // Merge additional env vars with process.env
719
- env: options.env ? { ...process.env, ...options.env } : process.env
720
- });
721
- let timedOut = false;
722
- const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
723
- timedOut = true;
724
- proc.kill("SIGKILL");
725
- }, options.timeoutMs) : void 0;
726
- try {
727
- const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
728
- const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
729
- const [stdout, stderr, exitCode] = await Promise.all([
730
- stdoutPromise,
731
- stderrPromise,
732
- proc.exited
733
- ]);
734
- if (timedOut) {
735
- throw new Error(`Process timed out after ${options.timeoutMs}ms`);
736
- }
737
- return {
738
- stdout: stdout.replace(/\r\n/g, "\n"),
739
- stderr: stderr.replace(/\r\n/g, "\n"),
740
- exitCode
741
- };
742
- } finally {
743
- if (timeout !== void 0) {
744
- clearTimeout(timeout);
745
- }
746
- }
747
- }
748
- async function execFileWithStdinNode(argv, stdinPayload, options) {
749
- const { spawn: spawn5 } = await import("node:child_process");
750
- return new Promise((resolve, reject) => {
751
- const [cmd, ...args] = argv;
752
- const child = spawn5(cmd, args, {
753
- cwd: options.cwd,
754
- stdio: ["pipe", "pipe", "pipe"],
755
- // Merge additional env vars with process.env
756
- env: options.env ? { ...process.env, ...options.env } : process.env
757
- });
758
- const stdoutChunks = [];
759
- const stderrChunks = [];
760
- child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
761
- child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
762
- let timedOut = false;
763
- const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
764
- timedOut = true;
765
- child.kill("SIGKILL");
766
- }, options.timeoutMs) : void 0;
767
- child.on("error", (error) => {
768
- if (timeout !== void 0) clearTimeout(timeout);
769
- reject(error);
770
- });
771
- child.on("close", (code) => {
772
- if (timeout !== void 0) clearTimeout(timeout);
773
- if (timedOut) {
774
- reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
775
- return;
776
- }
777
- const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
778
- const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
779
- resolve({
780
- stdout,
781
- stderr,
782
- exitCode: code ?? 0
783
- });
784
- });
785
- if (child.stdin) {
786
- child.stdin.write(stdinPayload);
787
- child.stdin.end();
788
- }
789
- });
790
- }
791
- async function execShellWithStdin(command, stdinPayload, options = {}) {
792
- const { mkdir: mkdir16, readFile: readFile17, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
793
- const { tmpdir: tmpdir3 } = await import("node:os");
794
- const path52 = await import("node:path");
795
- const { randomUUID: randomUUID10 } = await import("node:crypto");
796
- const dir = path52.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
797
- await mkdir16(dir, { recursive: true });
798
- const stdinPath = path52.join(dir, "stdin.txt");
799
- const stdoutPath = path52.join(dir, "stdout.txt");
800
- const stderrPath = path52.join(dir, "stderr.txt");
801
- await writeFile9(stdinPath, stdinPayload, "utf8");
802
- const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
803
- const { spawn: spawn5 } = await import("node:child_process");
804
- try {
805
- const exitCode = await new Promise((resolve, reject) => {
806
- const child = spawn5(wrappedCommand, {
807
- shell: true,
808
- cwd: options.cwd,
809
- stdio: ["ignore", "ignore", "ignore"],
810
- // Merge additional env vars with process.env
811
- env: options.env ? { ...process.env, ...options.env } : process.env
812
- });
813
- const timeout = options.timeoutMs ? setTimeout(() => {
814
- child.kill();
815
- reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
816
- }, options.timeoutMs) : void 0;
817
- child.on("error", (error) => {
818
- if (timeout !== void 0) {
819
- clearTimeout(timeout);
820
- }
821
- reject(error);
822
- });
823
- child.on("exit", (code) => {
824
- if (timeout !== void 0) {
825
- clearTimeout(timeout);
826
- }
827
- resolve(code ?? 0);
828
- });
829
- });
830
- const stdout = (await readFile17(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
831
- const stderr = (await readFile17(stderrPath, "utf8")).replace(/\r\n/g, "\n");
832
- return { stdout, stderr, exitCode };
833
- } finally {
834
- await rm6(dir, { recursive: true, force: true });
835
- }
836
- }
837
-
838
- // src/evaluation/content-preprocessor.ts
839
759
  var MIME_TYPE_ALIASES = {
840
760
  csv: "text/csv",
841
761
  docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -1076,6 +996,7 @@ function validateTemplateVariables(content, source) {
1076
996
  // src/evaluation/loaders/evaluator-parser.ts
1077
997
  var ANSI_YELLOW3 = "\x1B[33m";
1078
998
  var ANSI_RESET4 = "\x1B[0m";
999
+ var MAX_ASSERTION_INCLUDE_DEPTH = 3;
1079
1000
  var PROMPT_FILE_PREFIX = "file://";
1080
1001
  function normalizeEvaluatorType(type) {
1081
1002
  return type.replace(/_/g, "-");
@@ -1108,7 +1029,79 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
1108
1029
  const evaluators = [...parsedCase ?? [], ...parsedRoot ?? []];
1109
1030
  return evaluators.length > 0 ? evaluators : void 0;
1110
1031
  }
1111
- async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
1032
+ function isIncludeEntry(value) {
1033
+ return isJsonObject2(value) && typeof value.include === "string" && Object.keys(value).length === 1;
1034
+ }
1035
+ function isTemplateReference(value) {
1036
+ return !value.startsWith(".") && !value.includes("/") && !value.includes("\\");
1037
+ }
1038
+ async function resolveAssertionTemplateReference(include, searchRoots) {
1039
+ const templateCandidates = isTemplateReference(include) ? [
1040
+ path5.join(".agentv", "templates", `${include}.yaml`),
1041
+ path5.join(".agentv", "templates", `${include}.yml`)
1042
+ ] : [include];
1043
+ const attempted = [];
1044
+ for (const candidate of templateCandidates) {
1045
+ const resolved = await resolveFileReference2(candidate, searchRoots);
1046
+ attempted.push(...resolved.attempted);
1047
+ if (resolved.resolvedPath) {
1048
+ return {
1049
+ displayPath: resolved.displayPath,
1050
+ resolvedPath: resolved.resolvedPath,
1051
+ attempted
1052
+ };
1053
+ }
1054
+ }
1055
+ return {
1056
+ displayPath: templateCandidates[0] ?? include,
1057
+ resolvedPath: "",
1058
+ attempted
1059
+ };
1060
+ }
1061
+ async function loadAssertionTemplateEntries(include, searchRoots, evalId, includeContext) {
1062
+ const nextDepth = includeContext.depth + 1;
1063
+ if (nextDepth > MAX_ASSERTION_INCLUDE_DEPTH) {
1064
+ const chain = [...includeContext.chain, include].join(" -> ");
1065
+ throw new Error(
1066
+ `Assertion template include depth exceeded ${MAX_ASSERTION_INCLUDE_DEPTH} in '${evalId}'. Include chain: ${chain}`
1067
+ );
1068
+ }
1069
+ const resolved = await resolveAssertionTemplateReference(include, searchRoots);
1070
+ if (!resolved.resolvedPath) {
1071
+ const attempted = resolved.attempted.length > 0 ? `
1072
+ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
1073
+ throw new Error(
1074
+ `Assertion template not found in '${evalId}': ${resolved.displayPath}${attempted}`
1075
+ );
1076
+ }
1077
+ if (includeContext.chain.includes(resolved.resolvedPath)) {
1078
+ const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
1079
+ throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
1080
+ }
1081
+ const content = await readFile5(resolved.resolvedPath, "utf8");
1082
+ const parsed = interpolateEnv(parse2(content), process.env);
1083
+ if (!isJsonObject2(parsed)) {
1084
+ throw new Error(
1085
+ `Invalid assertion template file in '${evalId}': ${resolved.resolvedPath} (expected a YAML object with an assertions array)`
1086
+ );
1087
+ }
1088
+ const assertions = parsed.assertions;
1089
+ if (!Array.isArray(assertions)) {
1090
+ throw new Error(
1091
+ `Invalid assertion template file in '${evalId}': ${resolved.resolvedPath} is missing a top-level assertions array`
1092
+ );
1093
+ }
1094
+ const templateDir = path5.dirname(resolved.resolvedPath);
1095
+ const nestedSearchRoots = [
1096
+ templateDir,
1097
+ ...searchRoots.filter((root) => path5.resolve(root) !== templateDir)
1098
+ ];
1099
+ return await expandEvaluatorEntries(assertions, nestedSearchRoots, evalId, {
1100
+ depth: nextDepth,
1101
+ chain: [...includeContext.chain, resolved.resolvedPath]
1102
+ }) ?? [];
1103
+ }
1104
+ async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
1112
1105
  if (candidateEvaluators === void 0) {
1113
1106
  return void 0;
1114
1107
  }
@@ -1116,13 +1109,34 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
1116
1109
  logWarning2(`Skipping evaluators for '${evalId}': expected array`);
1117
1110
  return void 0;
1118
1111
  }
1119
- const firstStringIndex = candidateEvaluators.findIndex((e) => typeof e === "string");
1120
- const processedEvaluators = firstStringIndex === -1 ? [...candidateEvaluators] : (() => {
1112
+ const expanded = [];
1113
+ for (const rawEvaluator of candidateEvaluators) {
1114
+ if (isIncludeEntry(rawEvaluator)) {
1115
+ const included = await loadAssertionTemplateEntries(
1116
+ rawEvaluator.include,
1117
+ searchRoots,
1118
+ evalId,
1119
+ includeContext
1120
+ );
1121
+ expanded.push(...included);
1122
+ continue;
1123
+ }
1124
+ expanded.push(rawEvaluator);
1125
+ }
1126
+ return expanded;
1127
+ }
1128
+ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
1129
+ const expandedEvaluators = await expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId);
1130
+ if (!expandedEvaluators) {
1131
+ return void 0;
1132
+ }
1133
+ const firstStringIndex = expandedEvaluators.findIndex((e) => typeof e === "string");
1134
+ const processedEvaluators = firstStringIndex === -1 ? [...expandedEvaluators] : (() => {
1121
1135
  const PLACEHOLDER = Symbol("rubric-placeholder");
1122
1136
  const strings = [];
1123
1137
  const result = [];
1124
1138
  let rubricInserted = false;
1125
- for (const item of candidateEvaluators) {
1139
+ for (const item of expandedEvaluators) {
1126
1140
  if (typeof item === "string") {
1127
1141
  const trimmed = item.trim();
1128
1142
  if (trimmed.length === 0) {
@@ -1337,8 +1351,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
1337
1351
  );
1338
1352
  continue;
1339
1353
  }
1354
+ const expandedMembers = await expandEvaluatorEntries(
1355
+ rawMembers,
1356
+ searchRoots,
1357
+ `${evalId}:${name}`
1358
+ );
1359
+ if (!expandedMembers) {
1360
+ continue;
1361
+ }
1340
1362
  const memberEvaluators = [];
1341
- for (const rawMember of rawMembers) {
1363
+ for (const rawMember of expandedMembers) {
1342
1364
  if (!isJsonObject2(rawMember)) {
1343
1365
  logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
1344
1366
  continue;
@@ -2664,13 +2686,13 @@ function parseInlineRubrics(rawRubrics) {
2664
2686
  }
2665
2687
 
2666
2688
  // src/evaluation/loaders/jsonl-parser.ts
2667
- import { readFile as readFile6 } from "node:fs/promises";
2689
+ import { readFile as readFile7 } from "node:fs/promises";
2668
2690
  import path7 from "node:path";
2669
2691
  import micromatch from "micromatch";
2670
2692
  import { parse as parseYaml } from "yaml";
2671
2693
 
2672
2694
  // src/evaluation/loaders/message-processor.ts
2673
- import { readFile as readFile5 } from "node:fs/promises";
2695
+ import { readFile as readFile6 } from "node:fs/promises";
2674
2696
  import path6 from "node:path";
2675
2697
 
2676
2698
  // src/evaluation/formatting/segment-formatter.ts
@@ -2787,7 +2809,7 @@ async function processMessages(options) {
2787
2809
  continue;
2788
2810
  }
2789
2811
  try {
2790
- const fileContent = (await readFile5(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
2812
+ const fileContent = (await readFile6(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
2791
2813
  processedContent.push({
2792
2814
  ...cloneJsonObject(rawSegment),
2793
2815
  path: displayPath,
@@ -2828,7 +2850,7 @@ async function processMessages(options) {
2828
2850
  continue;
2829
2851
  }
2830
2852
  try {
2831
- const imageBuffer = await readFile5(resolvedPath);
2853
+ const imageBuffer = await readFile6(resolvedPath);
2832
2854
  const base64 = imageBuffer.toString("base64");
2833
2855
  processedContent.push({
2834
2856
  type: "image",
@@ -2905,7 +2927,7 @@ async function processExpectedMessages(options) {
2905
2927
  continue;
2906
2928
  }
2907
2929
  try {
2908
- const fileContent = (await readFile5(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
2930
+ const fileContent = (await readFile6(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
2909
2931
  processedContent.push({
2910
2932
  type: "file",
2911
2933
  path: displayPath,
@@ -2945,7 +2967,7 @@ async function processExpectedMessages(options) {
2945
2967
  continue;
2946
2968
  }
2947
2969
  try {
2948
- const imageBuffer = await readFile5(resolvedPath);
2970
+ const imageBuffer = await readFile6(resolvedPath);
2949
2971
  const base64 = imageBuffer.toString("base64");
2950
2972
  processedContent.push({
2951
2973
  type: "image",
@@ -3073,7 +3095,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
3073
3095
  return {};
3074
3096
  }
3075
3097
  try {
3076
- const content = await readFile6(sidecarPath, "utf8");
3098
+ const content = await readFile7(sidecarPath, "utf8");
3077
3099
  const parsed = interpolateEnv(parseYaml(content), process.env);
3078
3100
  if (!isJsonObject(parsed)) {
3079
3101
  logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
@@ -3118,7 +3140,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
3118
3140
  const repoRootPath = resolveToAbsolutePath(repoRoot);
3119
3141
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
3120
3142
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
3121
- const rawFile = await readFile6(absoluteTestPath, "utf8");
3143
+ const rawFile = await readFile7(absoluteTestPath, "utf8");
3122
3144
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
3123
3145
  const fallbackSuiteName = path7.basename(absoluteTestPath, ".jsonl") || "eval";
3124
3146
  const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
@@ -3300,11 +3322,13 @@ function parseRepoCheckout(raw) {
3300
3322
  if (!isJsonObject(raw)) return void 0;
3301
3323
  const obj = raw;
3302
3324
  const ref = typeof obj.ref === "string" ? obj.ref : void 0;
3325
+ const baseCommit = typeof obj.base_commit === "string" ? obj.base_commit : void 0;
3303
3326
  const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
3304
3327
  const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
3305
- if (!ref && !resolve && ancestor === void 0) return void 0;
3328
+ if (!ref && !baseCommit && !resolve && ancestor === void 0) return void 0;
3306
3329
  return {
3307
3330
  ...ref !== void 0 && { ref },
3331
+ ...baseCommit !== void 0 && { base_commit: baseCommit },
3308
3332
  ...resolve !== void 0 && { resolve },
3309
3333
  ...ancestor !== void 0 && { ancestor }
3310
3334
  };
@@ -3327,12 +3351,12 @@ function parseRepoConfig(raw) {
3327
3351
  const obj = raw;
3328
3352
  const repoPath = typeof obj.path === "string" ? obj.path : void 0;
3329
3353
  const source = parseRepoSource(obj.source);
3330
- if (!repoPath || !source) return void 0;
3331
3354
  const checkout = parseRepoCheckout(obj.checkout);
3332
3355
  const clone = parseRepoClone(obj.clone);
3356
+ if (!repoPath && !source && !checkout && !clone) return void 0;
3333
3357
  return {
3334
- path: repoPath,
3335
- source,
3358
+ ...repoPath !== void 0 && { path: repoPath },
3359
+ ...source !== void 0 && { source },
3336
3360
  ...checkout !== void 0 && { checkout },
3337
3361
  ...clone !== void 0 && { clone }
3338
3362
  };
@@ -3383,7 +3407,8 @@ ${messageContent}`);
3383
3407
  segmentsByMessage,
3384
3408
  mode
3385
3409
  }) : void 0;
3386
- return { question, chatPrompt };
3410
+ const systemMessage = extractSystemMessage(testCase.input, segmentsByMessage, mode);
3411
+ return { question, chatPrompt, systemMessage };
3387
3412
  }
3388
3413
  function needsRoleMarkers(messages, processedSegmentsByMessage) {
3389
3414
  if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
@@ -3397,6 +3422,26 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
3397
3422
  }
3398
3423
  return messagesWithContent > 1;
3399
3424
  }
3425
+ function extractSystemMessage(messages, segmentsByMessage, mode) {
3426
+ const systemParts = [];
3427
+ for (let i = 0; i < messages.length; i++) {
3428
+ if (messages[i].role !== "system") {
3429
+ break;
3430
+ }
3431
+ const segments = segmentsByMessage[i];
3432
+ const contentParts = [];
3433
+ for (const segment of segments) {
3434
+ const formatted = formatSegment(segment, mode);
3435
+ if (formatted) {
3436
+ contentParts.push(formatted);
3437
+ }
3438
+ }
3439
+ if (contentParts.length > 0) {
3440
+ systemParts.push(contentParts.join("\n"));
3441
+ }
3442
+ }
3443
+ return systemParts.length > 0 ? systemParts.join("\n\n") : void 0;
3444
+ }
3400
3445
  function buildChatPromptFromSegments(options) {
3401
3446
  const { messages, segmentsByMessage, systemPrompt, mode = "lm" } = options;
3402
3447
  if (messages.length === 0) {
@@ -3480,8 +3525,8 @@ function resolveTests(suite) {
3480
3525
  async function readTestSuiteMetadata(testFilePath) {
3481
3526
  try {
3482
3527
  const absolutePath = path8.resolve(testFilePath);
3483
- const content = await readFile7(absolutePath, "utf8");
3484
- const parsed = interpolateEnv(parse2(content), process.env);
3528
+ const content = await readFile8(absolutePath, "utf8");
3529
+ const parsed = interpolateEnv(parse3(content), process.env);
3485
3530
  if (!isJsonObject(parsed)) {
3486
3531
  return {};
3487
3532
  }
@@ -3538,8 +3583,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3538
3583
  const repoRootPath = resolveToAbsolutePath(repoRoot);
3539
3584
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
3540
3585
  const config = await loadConfig(absoluteTestPath, repoRootPath);
3541
- const rawFile = await readFile7(absoluteTestPath, "utf8");
3542
- const interpolated = interpolateEnv(parse2(rawFile), process.env);
3586
+ const rawFile = await readFile8(absoluteTestPath, "utf8");
3587
+ const interpolated = interpolateEnv(parse3(rawFile), process.env);
3543
3588
  if (!isJsonObject(interpolated)) {
3544
3589
  throw new Error(`Invalid test file format: ${evalFilePath}`);
3545
3590
  }
@@ -3680,7 +3725,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3680
3725
  const testCase = {
3681
3726
  id,
3682
3727
  suite: suiteName,
3683
- category: options?.category,
3728
+ category: suite.category ?? options?.category,
3684
3729
  conversation_id: conversationId,
3685
3730
  question,
3686
3731
  input: inputMessages,
@@ -3773,11 +3818,11 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
3773
3818
  const workspaceFilePath = path8.resolve(evalFileDir, raw);
3774
3819
  let content;
3775
3820
  try {
3776
- content = await readFile7(workspaceFilePath, "utf8");
3821
+ content = await readFile8(workspaceFilePath, "utf8");
3777
3822
  } catch {
3778
3823
  throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
3779
3824
  }
3780
- const parsed = interpolateEnv(parse2(content), process.env);
3825
+ const parsed = interpolateEnv(parse3(content), process.env);
3781
3826
  if (!isJsonObject(parsed)) {
3782
3827
  throw new Error(
3783
3828
  `Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
@@ -3812,14 +3857,28 @@ function parseWorkspaceConfig(raw, evalFileDir) {
3812
3857
  const explicitMode = obj.mode === "pooled" || obj.mode === "temp" || obj.mode === "static" ? obj.mode : void 0;
3813
3858
  const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
3814
3859
  const mode = explicitMode ?? (workspacePath ? "static" : void 0);
3815
- if (!template && !isolation && !repos && !hooks && !mode && !workspacePath) return void 0;
3860
+ const docker = parseDockerWorkspaceConfig(obj.docker);
3861
+ if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
3862
+ return void 0;
3816
3863
  return {
3817
3864
  ...template !== void 0 && { template },
3818
3865
  ...isolation !== void 0 && { isolation },
3819
3866
  ...repos !== void 0 && { repos },
3820
3867
  ...hooks !== void 0 && { hooks },
3821
3868
  ...mode !== void 0 && { mode },
3822
- ...workspacePath !== void 0 && { path: workspacePath }
3869
+ ...workspacePath !== void 0 && { path: workspacePath },
3870
+ ...docker !== void 0 && { docker }
3871
+ };
3872
+ }
3873
+ function parseDockerWorkspaceConfig(raw) {
3874
+ if (!isJsonObject(raw)) return void 0;
3875
+ const obj = raw;
3876
+ if (typeof obj.image !== "string") return void 0;
3877
+ return {
3878
+ image: obj.image,
3879
+ ...typeof obj.timeout === "number" && { timeout: obj.timeout },
3880
+ ...typeof obj.memory === "string" && { memory: obj.memory },
3881
+ ...typeof obj.cpus === "number" && { cpus: obj.cpus }
3823
3882
  };
3824
3883
  }
3825
3884
  function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
@@ -3848,7 +3907,8 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
3848
3907
  repos: caseLevel.repos ?? suiteLevel.repos,
3849
3908
  ...hasHooks && { hooks: mergedHooks },
3850
3909
  mode: caseLevel.mode ?? suiteLevel.mode,
3851
- path: caseLevel.path ?? suiteLevel.path
3910
+ path: caseLevel.path ?? suiteLevel.path,
3911
+ docker: caseLevel.docker ?? suiteLevel.docker
3852
3912
  };
3853
3913
  }
3854
3914
  function asString5(value) {
@@ -3876,7 +3936,7 @@ ${detailBlock}${ANSI_RESET7}`);
3876
3936
  // src/evaluation/loaders/eval-yaml-transpiler.ts
3877
3937
  import { readFileSync } from "node:fs";
3878
3938
  import path9 from "node:path";
3879
- import { parse as parse3 } from "yaml";
3939
+ import { parse as parse4 } from "yaml";
3880
3940
  function codeGraderInstruction(graderName, description) {
3881
3941
  const desc = description ? ` This grader: ${description}.` : "";
3882
3942
  return `Run \`agentv eval assert ${graderName} --agent-output <agent_output> --agent-input <original_prompt>\` and check the result.${desc} The command accepts --agent-output (the agent's full response text) and --agent-input (the original user prompt). It returns JSON on stdout: {"score": 0-1, "reasoning": "..."}. A score >= 0.5 means pass (exit 0); below 0.5 means fail (exit 1).`;
@@ -4115,7 +4175,7 @@ function transpileEvalYaml(suite, source = "EVAL.yaml") {
4115
4175
  }
4116
4176
  function transpileEvalYamlFile(evalYamlPath) {
4117
4177
  const content = readFileSync(evalYamlPath, "utf8");
4118
- const parsed = parse3(content);
4178
+ const parsed = parse4(content);
4119
4179
  return transpileEvalYaml(parsed, path9.basename(evalYamlPath));
4120
4180
  }
4121
4181
  function getOutputFilenames(result) {
@@ -6596,7 +6656,7 @@ import { arch, platform } from "node:os";
6596
6656
  import path15 from "node:path";
6597
6657
  import { fileURLToPath as fileURLToPath3 } from "node:url";
6598
6658
  function resolvePlatformCliPath() {
6599
- const os3 = platform();
6659
+ const os4 = platform();
6600
6660
  const cpu = arch();
6601
6661
  const platformMap = {
6602
6662
  linux: "linux",
@@ -6607,13 +6667,13 @@ function resolvePlatformCliPath() {
6607
6667
  x64: "x64",
6608
6668
  arm64: "arm64"
6609
6669
  };
6610
- const osPart = platformMap[os3];
6670
+ const osPart = platformMap[os4];
6611
6671
  const archPart = archMap[cpu];
6612
6672
  if (!osPart || !archPart) {
6613
6673
  return void 0;
6614
6674
  }
6615
6675
  const packageName = `@github/copilot-${osPart}-${archPart}`;
6616
- const binaryName = os3 === "win32" ? "copilot.exe" : "copilot";
6676
+ const binaryName = os4 === "win32" ? "copilot.exe" : "copilot";
6617
6677
  try {
6618
6678
  const resolved = import.meta.resolve(`${packageName}/package.json`);
6619
6679
  const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath3(resolved) : resolved;
@@ -7130,7 +7190,7 @@ function summarizeAcpEvent(eventType, data) {
7130
7190
  }
7131
7191
 
7132
7192
  // src/evaluation/providers/copilot-log.ts
7133
- import { readFile as readFile9 } from "node:fs/promises";
7193
+ import { readFile as readFile10 } from "node:fs/promises";
7134
7194
  import { homedir as homedir2 } from "node:os";
7135
7195
  import path18 from "node:path";
7136
7196
 
@@ -7264,7 +7324,7 @@ function parseCopilotEvents(eventsJsonl) {
7264
7324
  }
7265
7325
 
7266
7326
  // src/evaluation/providers/copilot-session-discovery.ts
7267
- import { readFile as readFile8, readdir, stat } from "node:fs/promises";
7327
+ import { readFile as readFile9, readdir, stat } from "node:fs/promises";
7268
7328
  import { homedir } from "node:os";
7269
7329
  import path17 from "node:path";
7270
7330
  import { parse as parseYaml2 } from "yaml";
@@ -7284,7 +7344,7 @@ async function discoverCopilotSessions(opts) {
7284
7344
  const workspacePath = path17.join(sessionDir, "workspace.yaml");
7285
7345
  const eventsPath = path17.join(sessionDir, "events.jsonl");
7286
7346
  try {
7287
- const workspaceContent = await readFile8(workspacePath, "utf8");
7347
+ const workspaceContent = await readFile9(workspacePath, "utf8");
7288
7348
  const workspace = parseYaml2(workspaceContent) ?? {};
7289
7349
  const cwd = String(workspace.cwd ?? "");
7290
7350
  let updatedAt;
@@ -7346,7 +7406,7 @@ var CopilotLogProvider = class {
7346
7406
  const eventsPath = path18.join(sessionDir, "events.jsonl");
7347
7407
  let eventsContent;
7348
7408
  try {
7349
- eventsContent = await readFile9(eventsPath, "utf8");
7409
+ eventsContent = await readFile10(eventsPath, "utf8");
7350
7410
  } catch (err) {
7351
7411
  throw new Error(
7352
7412
  `Failed to read Copilot session transcript at ${eventsPath}: ${err instanceof Error ? err.message : String(err)}`
@@ -9632,7 +9692,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
9632
9692
  }
9633
9693
 
9634
9694
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
9635
- import { readFile as readFile10 } from "node:fs/promises";
9695
+ import { readFile as readFile11 } from "node:fs/promises";
9636
9696
  import path26 from "node:path";
9637
9697
 
9638
9698
  // src/evaluation/providers/vscode/utils/time.ts
@@ -9671,7 +9731,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
9671
9731
  const maxAttempts = 10;
9672
9732
  while (attempts < maxAttempts) {
9673
9733
  try {
9674
- const content = await readFile10(responseFileFinal, { encoding: "utf8" });
9734
+ const content = await readFile11(responseFileFinal, { encoding: "utf8" });
9675
9735
  if (!silent) {
9676
9736
  process.stdout.write(`${content}
9677
9737
  `);
@@ -9728,7 +9788,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
9728
9788
  const maxAttempts = 10;
9729
9789
  while (attempts < maxAttempts) {
9730
9790
  try {
9731
- const content = await readFile10(file, { encoding: "utf8" });
9791
+ const content = await readFile11(file, { encoding: "utf8" });
9732
9792
  if (!silent) {
9733
9793
  process.stdout.write(`${content}
9734
9794
  `);
@@ -9913,7 +9973,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
9913
9973
  }
9914
9974
 
9915
9975
  // src/evaluation/providers/vscode/dispatch/workspaceManager.ts
9916
- import { copyFile, mkdir as mkdir10, readFile as readFile11, readdir as readdir3, stat as stat3, writeFile as writeFile3 } from "node:fs/promises";
9976
+ import { copyFile, mkdir as mkdir10, readFile as readFile12, readdir as readdir3, stat as stat3, writeFile as writeFile3 } from "node:fs/promises";
9917
9977
  import path30 from "node:path";
9918
9978
 
9919
9979
  // src/evaluation/providers/vscode/utils/workspace.ts
@@ -10030,7 +10090,7 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
10030
10090
  if (!stats.isFile()) {
10031
10091
  throw new Error(`workspace template must be a file, not a directory: ${workspaceSrc}`);
10032
10092
  }
10033
- const templateText = await readFile11(workspaceSrc, "utf8");
10093
+ const templateText = await readFile12(workspaceSrc, "utf8");
10034
10094
  workspaceContent = JSON.parse(templateText);
10035
10095
  } else {
10036
10096
  workspaceContent = DEFAULT_WORKSPACE_TEMPLATE;
@@ -10893,9 +10953,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
10893
10953
 
10894
10954
  // src/evaluation/providers/targets-file.ts
10895
10955
  import { constants as constants4 } from "node:fs";
10896
- import { access as access4, readFile as readFile12 } from "node:fs/promises";
10956
+ import { access as access4, readFile as readFile13 } from "node:fs/promises";
10897
10957
  import path34 from "node:path";
10898
- import { parse as parse4 } from "yaml";
10958
+ import { parse as parse5 } from "yaml";
10899
10959
  function isRecord(value) {
10900
10960
  return typeof value === "object" && value !== null && !Array.isArray(value);
10901
10961
  }
@@ -10938,8 +10998,8 @@ async function readTargetDefinitions(filePath) {
10938
10998
  if (!await fileExists3(absolutePath)) {
10939
10999
  throw new Error(`targets.yaml not found at ${absolutePath}`);
10940
11000
  }
10941
- const raw = await readFile12(absolutePath, "utf8");
10942
- const parsed = parse4(raw);
11001
+ const raw = await readFile13(absolutePath, "utf8");
11002
+ const parsed = parse5(raw);
10943
11003
  if (!isRecord(parsed)) {
10944
11004
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
10945
11005
  }
@@ -11381,6 +11441,18 @@ function toCamelCaseDeep(obj) {
11381
11441
  return obj;
11382
11442
  }
11383
11443
 
11444
+ // src/evaluation/workspace/repo-checkout.ts
11445
+ function getRepoCheckoutRef(checkout) {
11446
+ return checkout?.base_commit ?? checkout?.ref ?? "HEAD";
11447
+ }
11448
+ function getRepoCheckoutTargets(repos) {
11449
+ if (!repos) return [];
11450
+ return repos.filter((repo) => repo.checkout?.base_commit || repo.checkout?.ref).map((repo) => ({
11451
+ path: repo.path,
11452
+ ref: getRepoCheckoutRef(repo.checkout)
11453
+ }));
11454
+ }
11455
+
11384
11456
  // src/evaluation/evaluators/code-evaluator.ts
11385
11457
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
11386
11458
  var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
@@ -11515,13 +11587,31 @@ var CodeEvaluator = class {
11515
11587
  const workspaceEnv = context.workspacePath ? { AGENTV_WORKSPACE_PATH: context.workspacePath } : void 0;
11516
11588
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
11517
11589
  try {
11518
- const stdout = await executeScript(
11519
- this.command,
11520
- inputPayload,
11521
- this.agentTimeoutMs,
11522
- this.cwd,
11523
- env
11524
- );
11590
+ let stdout;
11591
+ if (context.dockerConfig) {
11592
+ const { DockerWorkspaceProvider: DockerWorkspaceProvider2 } = await import("./docker-workspace-RPPXBT27.js");
11593
+ const dockerProvider = new DockerWorkspaceProvider2(context.dockerConfig);
11594
+ const result = await dockerProvider.runGraderInContainer({
11595
+ command: [...this.command],
11596
+ stdin: inputPayload,
11597
+ repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos)
11598
+ });
11599
+ if (result.exitCode !== 0) {
11600
+ const trimmedErr = result.stderr.trim();
11601
+ throw new Error(
11602
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
11603
+ );
11604
+ }
11605
+ stdout = result.stdout.trim();
11606
+ } else {
11607
+ stdout = await executeScript(
11608
+ this.command,
11609
+ inputPayload,
11610
+ this.agentTimeoutMs,
11611
+ this.cwd,
11612
+ env
11613
+ );
11614
+ }
11525
11615
  const parsed = parseJsonSafe(stdout);
11526
11616
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
11527
11617
  const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
@@ -12682,11 +12772,11 @@ function createFilesystemTools(workspacePath) {
12682
12772
  execute: async (input) => {
12683
12773
  try {
12684
12774
  const resolved = resolveSandboxed(workspacePath, input.path);
12685
- const stat11 = await fs2.stat(resolved);
12686
- if (stat11.isDirectory()) {
12775
+ const stat12 = await fs2.stat(resolved);
12776
+ if (stat12.isDirectory()) {
12687
12777
  return { error: `'${input.path}' is a directory, not a file` };
12688
12778
  }
12689
- const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
12779
+ const buffer = Buffer.alloc(Math.min(stat12.size, MAX_FILE_SIZE));
12690
12780
  const fd = await fs2.open(resolved, "r");
12691
12781
  try {
12692
12782
  await fd.read(buffer, 0, buffer.length, 0);
@@ -12694,8 +12784,8 @@ function createFilesystemTools(workspacePath) {
12694
12784
  await fd.close();
12695
12785
  }
12696
12786
  const content = buffer.toString("utf-8");
12697
- const truncated = stat11.size > MAX_FILE_SIZE;
12698
- return { content, truncated, size: stat11.size };
12787
+ const truncated = stat12.size > MAX_FILE_SIZE;
12788
+ return { content, truncated, size: stat12.size };
12699
12789
  } catch (error) {
12700
12790
  return { error: error instanceof Error ? error.message : String(error) };
12701
12791
  }
@@ -12746,8 +12836,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
12746
12836
  const ext = path36.extname(entry.name).toLowerCase();
12747
12837
  if (BINARY_EXTENSIONS.has(ext)) continue;
12748
12838
  try {
12749
- const stat11 = await fs2.stat(fullPath);
12750
- if (stat11.size > MAX_FILE_SIZE) continue;
12839
+ const stat12 = await fs2.stat(fullPath);
12840
+ if (stat12.size > MAX_FILE_SIZE) continue;
12751
12841
  const content = await fs2.readFile(fullPath, "utf-8");
12752
12842
  const lines = content.split("\n");
12753
12843
  for (let i = 0; i < lines.length; i++) {
@@ -13388,115 +13478,115 @@ var FieldAccuracyEvaluator = class {
13388
13478
  * Evaluate a single field against the expected value.
13389
13479
  */
13390
13480
  evaluateField(fieldConfig, candidateData, expectedData) {
13391
- const { path: path52, match, required = true, weight = 1 } = fieldConfig;
13392
- const candidateValue = resolvePath(candidateData, path52);
13393
- const expectedValue = resolvePath(expectedData, path52);
13481
+ const { path: path53, match, required = true, weight = 1 } = fieldConfig;
13482
+ const candidateValue = resolvePath(candidateData, path53);
13483
+ const expectedValue = resolvePath(expectedData, path53);
13394
13484
  if (expectedValue === void 0) {
13395
13485
  return {
13396
- path: path52,
13486
+ path: path53,
13397
13487
  score: 1,
13398
13488
  // No expected value means no comparison needed
13399
13489
  weight,
13400
13490
  hit: true,
13401
- message: `${path52}: no expected value`
13491
+ message: `${path53}: no expected value`
13402
13492
  };
13403
13493
  }
13404
13494
  if (candidateValue === void 0) {
13405
13495
  if (required) {
13406
13496
  return {
13407
- path: path52,
13497
+ path: path53,
13408
13498
  score: 0,
13409
13499
  weight,
13410
13500
  hit: false,
13411
- message: `${path52} (required, missing)`
13501
+ message: `${path53} (required, missing)`
13412
13502
  };
13413
13503
  }
13414
13504
  return {
13415
- path: path52,
13505
+ path: path53,
13416
13506
  score: 1,
13417
13507
  // Don't penalize missing optional fields
13418
13508
  weight: 0,
13419
13509
  // Zero weight means it won't affect the score
13420
13510
  hit: true,
13421
- message: `${path52}: optional field missing`
13511
+ message: `${path53}: optional field missing`
13422
13512
  };
13423
13513
  }
13424
13514
  switch (match) {
13425
13515
  case "exact":
13426
- return this.compareExact(path52, candidateValue, expectedValue, weight);
13516
+ return this.compareExact(path53, candidateValue, expectedValue, weight);
13427
13517
  case "numeric_tolerance":
13428
13518
  return this.compareNumericTolerance(
13429
- path52,
13519
+ path53,
13430
13520
  candidateValue,
13431
13521
  expectedValue,
13432
13522
  fieldConfig,
13433
13523
  weight
13434
13524
  );
13435
13525
  case "date":
13436
- return this.compareDate(path52, candidateValue, expectedValue, fieldConfig, weight);
13526
+ return this.compareDate(path53, candidateValue, expectedValue, fieldConfig, weight);
13437
13527
  default:
13438
13528
  return {
13439
- path: path52,
13529
+ path: path53,
13440
13530
  score: 0,
13441
13531
  weight,
13442
13532
  hit: false,
13443
- message: `${path52}: unknown match type "${match}"`
13533
+ message: `${path53}: unknown match type "${match}"`
13444
13534
  };
13445
13535
  }
13446
13536
  }
13447
13537
  /**
13448
13538
  * Exact equality comparison.
13449
13539
  */
13450
- compareExact(path52, candidateValue, expectedValue, weight) {
13540
+ compareExact(path53, candidateValue, expectedValue, weight) {
13451
13541
  if (deepEqual(candidateValue, expectedValue)) {
13452
13542
  return {
13453
- path: path52,
13543
+ path: path53,
13454
13544
  score: 1,
13455
13545
  weight,
13456
13546
  hit: true,
13457
- message: path52
13547
+ message: path53
13458
13548
  };
13459
13549
  }
13460
13550
  if (typeof candidateValue !== typeof expectedValue) {
13461
13551
  return {
13462
- path: path52,
13552
+ path: path53,
13463
13553
  score: 0,
13464
13554
  weight,
13465
13555
  hit: false,
13466
- message: `${path52} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
13556
+ message: `${path53} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
13467
13557
  };
13468
13558
  }
13469
13559
  return {
13470
- path: path52,
13560
+ path: path53,
13471
13561
  score: 0,
13472
13562
  weight,
13473
13563
  hit: false,
13474
- message: `${path52} (value mismatch)`
13564
+ message: `${path53} (value mismatch)`
13475
13565
  };
13476
13566
  }
13477
13567
  /**
13478
13568
  * Numeric comparison with absolute or relative tolerance.
13479
13569
  */
13480
- compareNumericTolerance(path52, candidateValue, expectedValue, fieldConfig, weight) {
13570
+ compareNumericTolerance(path53, candidateValue, expectedValue, fieldConfig, weight) {
13481
13571
  const { tolerance = 0, relative = false } = fieldConfig;
13482
13572
  const candidateNum = toNumber(candidateValue);
13483
13573
  const expectedNum = toNumber(expectedValue);
13484
13574
  if (candidateNum === null || expectedNum === null) {
13485
13575
  return {
13486
- path: path52,
13576
+ path: path53,
13487
13577
  score: 0,
13488
13578
  weight,
13489
13579
  hit: false,
13490
- message: `${path52} (non-numeric value)`
13580
+ message: `${path53} (non-numeric value)`
13491
13581
  };
13492
13582
  }
13493
13583
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
13494
13584
  return {
13495
- path: path52,
13585
+ path: path53,
13496
13586
  score: 0,
13497
13587
  weight,
13498
13588
  hit: false,
13499
- message: `${path52} (invalid numeric value)`
13589
+ message: `${path53} (invalid numeric value)`
13500
13590
  };
13501
13591
  }
13502
13592
  const diff = Math.abs(candidateNum - expectedNum);
@@ -13509,61 +13599,61 @@ var FieldAccuracyEvaluator = class {
13509
13599
  }
13510
13600
  if (withinTolerance) {
13511
13601
  return {
13512
- path: path52,
13602
+ path: path53,
13513
13603
  score: 1,
13514
13604
  weight,
13515
13605
  hit: true,
13516
- message: `${path52} (within tolerance: diff=${diff.toFixed(2)})`
13606
+ message: `${path53} (within tolerance: diff=${diff.toFixed(2)})`
13517
13607
  };
13518
13608
  }
13519
13609
  return {
13520
- path: path52,
13610
+ path: path53,
13521
13611
  score: 0,
13522
13612
  weight,
13523
13613
  hit: false,
13524
- message: `${path52} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13614
+ message: `${path53} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13525
13615
  };
13526
13616
  }
13527
13617
  /**
13528
13618
  * Date comparison with format normalization.
13529
13619
  */
13530
- compareDate(path52, candidateValue, expectedValue, fieldConfig, weight) {
13620
+ compareDate(path53, candidateValue, expectedValue, fieldConfig, weight) {
13531
13621
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
13532
13622
  const candidateDate = parseDate(String(candidateValue), formats);
13533
13623
  const expectedDate = parseDate(String(expectedValue), formats);
13534
13624
  if (candidateDate === null) {
13535
13625
  return {
13536
- path: path52,
13626
+ path: path53,
13537
13627
  score: 0,
13538
13628
  weight,
13539
13629
  hit: false,
13540
- message: `${path52} (unparseable candidate date)`
13630
+ message: `${path53} (unparseable candidate date)`
13541
13631
  };
13542
13632
  }
13543
13633
  if (expectedDate === null) {
13544
13634
  return {
13545
- path: path52,
13635
+ path: path53,
13546
13636
  score: 0,
13547
13637
  weight,
13548
13638
  hit: false,
13549
- message: `${path52} (unparseable expected date)`
13639
+ message: `${path53} (unparseable expected date)`
13550
13640
  };
13551
13641
  }
13552
13642
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
13553
13643
  return {
13554
- path: path52,
13644
+ path: path53,
13555
13645
  score: 1,
13556
13646
  weight,
13557
13647
  hit: true,
13558
- message: path52
13648
+ message: path53
13559
13649
  };
13560
13650
  }
13561
13651
  return {
13562
- path: path52,
13652
+ path: path53,
13563
13653
  score: 0,
13564
13654
  weight,
13565
13655
  hit: false,
13566
- message: `${path52} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13656
+ message: `${path53} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13567
13657
  };
13568
13658
  }
13569
13659
  /**
@@ -13596,11 +13686,11 @@ var FieldAccuracyEvaluator = class {
13596
13686
  };
13597
13687
  }
13598
13688
  };
13599
- function resolvePath(obj, path52) {
13600
- if (!path52 || !obj) {
13689
+ function resolvePath(obj, path53) {
13690
+ if (!path53 || !obj) {
13601
13691
  return void 0;
13602
13692
  }
13603
- const parts = path52.split(/\.|\[|\]/).filter((p) => p.length > 0);
13693
+ const parts = path53.split(/\.|\[|\]/).filter((p) => p.length > 0);
13604
13694
  let current = obj;
13605
13695
  for (const part of parts) {
13606
13696
  if (current === null || current === void 0) {
@@ -14092,8 +14182,8 @@ var TokenUsageEvaluator = class {
14092
14182
  };
14093
14183
 
14094
14184
  // src/evaluation/evaluators/tool-trajectory.ts
14095
- function getNestedValue(obj, path52) {
14096
- const parts = path52.split(".");
14185
+ function getNestedValue(obj, path53) {
14186
+ const parts = path53.split(".");
14097
14187
  let current = obj;
14098
14188
  for (const part of parts) {
14099
14189
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -14959,6 +15049,15 @@ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
14959
15049
  }
14960
15050
  return void 0;
14961
15051
  }
15052
+ function containsTemplateVariables(text) {
15053
+ const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
15054
+ for (const match of text.matchAll(variablePattern)) {
15055
+ if (VALID_TEMPLATE_VARIABLES.has(match[1])) {
15056
+ return true;
15057
+ }
15058
+ }
15059
+ return false;
15060
+ }
14962
15061
  async function executePromptTemplate(script, context, config, timeoutMs) {
14963
15062
  const payload = {
14964
15063
  criteria: context.evalCase.criteria,
@@ -15031,9 +15130,20 @@ var llmGraderFactory = (config, context) => {
15031
15130
  },
15032
15131
  agentTimeoutMs
15033
15132
  );
15133
+ const isFromInlinePrompt = !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
15134
+ let evaluatorTemplateOverride;
15135
+ let evalCase = evalContext.evalCase;
15136
+ if (customPrompt) {
15137
+ if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
15138
+ evaluatorTemplateOverride = customPrompt;
15139
+ } else {
15140
+ evalCase = { ...evalCase, criteria: customPrompt };
15141
+ }
15142
+ }
15034
15143
  return evaluator.evaluate({
15035
15144
  ...evalContext,
15036
- evaluatorTemplateOverride: customPrompt,
15145
+ evalCase,
15146
+ evaluatorTemplateOverride,
15037
15147
  evaluator: c
15038
15148
  });
15039
15149
  }
@@ -15630,7 +15740,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
15630
15740
  import { execFile } from "node:child_process";
15631
15741
  import { createHash } from "node:crypto";
15632
15742
  import { existsSync as existsSync3 } from "node:fs";
15633
- import { cp as cp2, mkdir as mkdir13, readFile as readFile13, readdir as readdir5, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
15743
+ import { cp as cp2, mkdir as mkdir13, readFile as readFile14, readdir as readdir5, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
15634
15744
  import path42 from "node:path";
15635
15745
  import { promisify as promisify5 } from "node:util";
15636
15746
  var execFileAsync = promisify5(execFile);
@@ -15658,12 +15768,14 @@ async function git(args, opts) {
15658
15768
  return stdout.trim();
15659
15769
  }
15660
15770
  function normalizeRepoForFingerprint(repo) {
15661
- const source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
15662
- const result = {
15663
- path: repo.path,
15664
- source,
15665
- ref: repo.checkout?.ref ?? "HEAD"
15666
- };
15771
+ const result = {};
15772
+ if (repo.path) {
15773
+ result.path = repo.path;
15774
+ }
15775
+ if (repo.source) {
15776
+ result.source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
15777
+ }
15778
+ result.ref = getRepoCheckoutRef(repo.checkout);
15667
15779
  if (repo.clone?.depth !== void 0) {
15668
15780
  result.depth = repo.clone.depth;
15669
15781
  }
@@ -15677,7 +15789,7 @@ function normalizeRepoForFingerprint(repo) {
15677
15789
  }
15678
15790
  function computeWorkspaceFingerprint(repos) {
15679
15791
  const canonical = {
15680
- repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint)
15792
+ repos: [...repos].sort((a, b) => (a.path ?? "").localeCompare(b.path ?? "")).map(normalizeRepoForFingerprint)
15681
15793
  };
15682
15794
  return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
15683
15795
  }
@@ -15791,7 +15903,7 @@ var WorkspacePoolManager = class {
15791
15903
  throw err;
15792
15904
  }
15793
15905
  try {
15794
- const pidStr = await readFile13(lockPath, "utf-8");
15906
+ const pidStr = await readFile14(lockPath, "utf-8");
15795
15907
  const pid = Number.parseInt(pidStr.trim(), 10);
15796
15908
  if (!Number.isNaN(pid)) {
15797
15909
  try {
@@ -15818,7 +15930,7 @@ var WorkspacePoolManager = class {
15818
15930
  async checkDrift(poolDir, fingerprint) {
15819
15931
  const metadataPath = path42.join(poolDir, "metadata.json");
15820
15932
  try {
15821
- const raw = await readFile13(metadataPath, "utf-8");
15933
+ const raw = await readFile14(metadataPath, "utf-8");
15822
15934
  const metadata = JSON.parse(raw);
15823
15935
  return metadata.fingerprint !== fingerprint;
15824
15936
  } catch {
@@ -15843,7 +15955,7 @@ var WorkspacePoolManager = class {
15843
15955
  const lockPath = path42.join(poolDir, `${entry}.lock`);
15844
15956
  if (existsSync3(lockPath)) {
15845
15957
  try {
15846
- const pidStr = await readFile13(lockPath, "utf-8");
15958
+ const pidStr = await readFile14(lockPath, "utf-8");
15847
15959
  const pid = Number.parseInt(pidStr.trim(), 10);
15848
15960
  if (!Number.isNaN(pid)) {
15849
15961
  try {
@@ -15871,6 +15983,7 @@ var WorkspacePoolManager = class {
15871
15983
  */
15872
15984
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
15873
15985
  for (const repo of repos) {
15986
+ if (!repo.path || !repo.source) continue;
15874
15987
  const repoDir = path42.join(slotPath, repo.path);
15875
15988
  if (!existsSync3(repoDir)) {
15876
15989
  continue;
@@ -15878,7 +15991,7 @@ var WorkspacePoolManager = class {
15878
15991
  if (poolReset === "none") {
15879
15992
  continue;
15880
15993
  }
15881
- const ref = repo.checkout?.ref ?? "HEAD";
15994
+ const ref = getRepoCheckoutRef(repo.checkout);
15882
15995
  const resolve = repo.checkout?.resolve ?? "remote";
15883
15996
  if (resolve === "remote") {
15884
15997
  const fetchArgs = ["fetch", "origin", ref];
@@ -15895,8 +16008,8 @@ var WorkspacePoolManager = class {
15895
16008
  }
15896
16009
  if (templatePath) {
15897
16010
  const repoDirNames = new Set(
15898
- repos.map((r) => {
15899
- const normalized = r.path.replace(/^\.\//, "");
16011
+ repos.filter((r) => r.path).map((r) => {
16012
+ const normalized = (r.path ?? "").replace(/^\.\//, "");
15900
16013
  return normalized.split("/")[0];
15901
16014
  })
15902
16015
  );
@@ -15951,17 +16064,17 @@ var RepoManager = class {
15951
16064
  static validateLocalPaths(repos) {
15952
16065
  const errors = [];
15953
16066
  for (const repo of repos) {
15954
- if (repo.source.type !== "local") continue;
16067
+ if (!repo.source || repo.source.type !== "local") continue;
15955
16068
  const sourcePath = repo.source.path;
15956
16069
  if (!sourcePath || sourcePath.trim() === "") {
15957
16070
  errors.push({
15958
- repoPath: repo.path,
16071
+ repoPath: repo.path ?? "(none)",
15959
16072
  resolvedSourcePath: sourcePath ?? "",
15960
16073
  reason: "empty_path"
15961
16074
  });
15962
16075
  } else if (!existsSync4(sourcePath)) {
15963
16076
  errors.push({
15964
- repoPath: repo.path,
16077
+ repoPath: repo.path ?? "(none)",
15965
16078
  resolvedSourcePath: sourcePath,
15966
16079
  reason: "not_found"
15967
16080
  });
@@ -16008,6 +16121,12 @@ ${lines.join("\n")}`;
16008
16121
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
16009
16122
  */
16010
16123
  async materialize(repo, workspacePath) {
16124
+ if (!repo.source || !repo.path) {
16125
+ if (this.verbose) {
16126
+ console.log(`[repo] materialize skip path=${repo.path ?? "(none)"} (no source or path)`);
16127
+ }
16128
+ return;
16129
+ }
16011
16130
  const targetDir = path43.join(workspacePath, repo.path);
16012
16131
  const sourceUrl = getSourceUrl(repo.source);
16013
16132
  const startedAt = Date.now();
@@ -16031,7 +16150,7 @@ ${lines.join("\n")}`;
16031
16150
  await this.runGit(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
16032
16151
  await this.runGit(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
16033
16152
  }
16034
- const ref = repo.checkout?.ref ?? "HEAD";
16153
+ const ref = getRepoCheckoutRef(repo.checkout);
16035
16154
  const resolve = repo.checkout?.resolve ?? "remote";
16036
16155
  let resolvedSha;
16037
16156
  if (resolve === "remote" && repo.source.type === "git") {
@@ -16083,22 +16202,26 @@ ${lines.join("\n")}`;
16083
16202
  );
16084
16203
  }
16085
16204
  }
16086
- /** Materialize all repos into the workspace. */
16205
+ /** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
16087
16206
  async materializeAll(repos, workspacePath) {
16207
+ const materializableRepos = repos.filter((r) => r.source);
16088
16208
  if (this.verbose) {
16089
- console.log(`[repo] materializeAll count=${repos.length} workspace=${workspacePath}`);
16209
+ console.log(
16210
+ `[repo] materializeAll count=${materializableRepos.length} (${repos.length - materializableRepos.length} skipped, no source) workspace=${workspacePath}`
16211
+ );
16090
16212
  }
16091
- for (const repo of repos) {
16213
+ for (const repo of materializableRepos) {
16092
16214
  await this.materialize(repo, workspacePath);
16093
16215
  }
16094
16216
  if (this.verbose) {
16095
16217
  console.log("[repo] materializeAll complete");
16096
16218
  }
16097
16219
  }
16098
- /** Reset repos in workspace to their checkout state. */
16220
+ /** Reset repos in workspace to their checkout state. Skips repos without path or source. */
16099
16221
  async reset(repos, workspacePath, reset) {
16100
16222
  const cleanFlag = reset === "strict" ? "-fdx" : "-fd";
16101
16223
  for (const repo of repos) {
16224
+ if (!repo.path || !repo.source) continue;
16102
16225
  const targetDir = path43.join(workspacePath, repo.path);
16103
16226
  await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
16104
16227
  await this.runGit(["clean", cleanFlag], { cwd: targetDir });
@@ -16422,7 +16545,8 @@ async function runEvaluation(options) {
16422
16545
  for (const ec of filteredEvalCases) {
16423
16546
  if (ec.workspace?.repos) {
16424
16547
  for (const repo of ec.workspace.repos) {
16425
- const key = `${repo.path}::${repo.source.type === "local" ? repo.source.path : ""}`;
16548
+ if (!repo.source) continue;
16549
+ const key = `${repo.path ?? ""}::${repo.source.type === "local" ? repo.source.path : ""}`;
16426
16550
  if (!allRepos.has(key)) {
16427
16551
  allRepos.set(key, repo);
16428
16552
  }
@@ -16435,7 +16559,7 @@ async function runEvaluation(options) {
16435
16559
  const message = RepoManager.formatValidationErrors(localPathErrors);
16436
16560
  console.warn(`Warning: ${message}`);
16437
16561
  const invalidLocalRepoPaths = new Set(localPathErrors.map((e) => e.repoPath));
16438
- if (suiteWorkspace?.repos?.some((r) => invalidLocalRepoPaths.has(r.path))) {
16562
+ if (suiteWorkspace?.repos?.some((r) => r.path && invalidLocalRepoPaths.has(r.path))) {
16439
16563
  throw new Error(message);
16440
16564
  }
16441
16565
  }
@@ -16568,6 +16692,7 @@ async function runEvaluation(options) {
16568
16692
  try {
16569
16693
  if (needsPerRepoCheck) {
16570
16694
  for (const repo of suiteWorkspace.repos) {
16695
+ if (!repo.path || !repo.source) continue;
16571
16696
  const targetDir = path45.join(sharedWorkspacePath, repo.path);
16572
16697
  if (existsSync5(targetDir)) {
16573
16698
  setupLog(`reusing existing repo at: ${targetDir}`);
@@ -16592,6 +16717,19 @@ async function runEvaluation(options) {
16592
16717
  throw new Error(`Failed to materialize repos: ${message}`);
16593
16718
  }
16594
16719
  }
16720
+ const suiteDockerConfig = suiteWorkspace?.docker;
16721
+ if (suiteDockerConfig) {
16722
+ setupLog(`pulling Docker image: ${suiteDockerConfig.image}`);
16723
+ const { DockerWorkspaceProvider: DockerWorkspaceProvider2 } = await import("./docker-workspace-RPPXBT27.js");
16724
+ const dockerSetup = new DockerWorkspaceProvider2(suiteDockerConfig);
16725
+ if (!await dockerSetup.isDockerAvailable()) {
16726
+ throw new Error(
16727
+ "Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running."
16728
+ );
16729
+ }
16730
+ await dockerSetup.pullImage();
16731
+ setupLog("Docker image pull complete");
16732
+ }
16595
16733
  const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
16596
16734
  const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
16597
16735
  if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
@@ -16952,11 +17090,9 @@ async function runBatchEvaluation(options) {
16952
17090
  const promptInputs = promptInputsList[index];
16953
17091
  return {
16954
17092
  question: promptInputs.question,
17093
+ systemPrompt: promptInputs.systemMessage,
16955
17094
  inputFiles: evalCase.file_paths,
16956
- evalCaseId: evalCase.id,
16957
- metadata: {
16958
- systemPrompt: promptInputs.systemMessage ?? ""
16959
- }
17095
+ evalCaseId: evalCase.id
16960
17096
  };
16961
17097
  });
16962
17098
  const batchResponse = await provider.invokeBatch?.(batchRequests);
@@ -17487,6 +17623,7 @@ async function runEvalCase(options) {
17487
17623
  availableTargets,
17488
17624
  fileChanges,
17489
17625
  workspacePath,
17626
+ dockerConfig: evalCase.workspace?.docker,
17490
17627
  verbose,
17491
17628
  threshold: evalCase.threshold ?? caseThreshold
17492
17629
  });
@@ -17680,6 +17817,7 @@ async function evaluateCandidate(options) {
17680
17817
  availableTargets,
17681
17818
  fileChanges,
17682
17819
  workspacePath,
17820
+ dockerConfig,
17683
17821
  threshold: evalThreshold
17684
17822
  } = options;
17685
17823
  const gradeTimestamp = nowFn();
@@ -17706,6 +17844,7 @@ async function evaluateCandidate(options) {
17706
17844
  availableTargets,
17707
17845
  fileChanges,
17708
17846
  workspacePath,
17847
+ dockerConfig,
17709
17848
  threshold: evalThreshold
17710
17849
  });
17711
17850
  const completedAt = nowFn();
@@ -17781,6 +17920,7 @@ async function runEvaluatorsForCase(options) {
17781
17920
  availableTargets,
17782
17921
  fileChanges,
17783
17922
  workspacePath,
17923
+ dockerConfig,
17784
17924
  threshold
17785
17925
  } = options;
17786
17926
  if (evalCase.assertions && evalCase.assertions.length > 0) {
@@ -17808,6 +17948,7 @@ async function runEvaluatorsForCase(options) {
17808
17948
  availableTargets,
17809
17949
  fileChanges,
17810
17950
  workspacePath,
17951
+ dockerConfig,
17811
17952
  threshold
17812
17953
  });
17813
17954
  }
@@ -17837,6 +17978,7 @@ async function runEvaluatorsForCase(options) {
17837
17978
  availableTargets,
17838
17979
  fileChanges,
17839
17980
  workspacePath,
17981
+ dockerConfig,
17840
17982
  ...implicitEvaluator ? { evaluator: implicitEvaluator } : {}
17841
17983
  });
17842
17984
  return { score };
@@ -17875,7 +18017,8 @@ async function runEvaluatorList(options) {
17875
18017
  targetResolver,
17876
18018
  availableTargets,
17877
18019
  fileChanges,
17878
- workspacePath
18020
+ workspacePath,
18021
+ dockerConfig
17879
18022
  } = options;
17880
18023
  const scored = [];
17881
18024
  const scores = [];
@@ -17898,7 +18041,8 @@ async function runEvaluatorList(options) {
17898
18041
  targetResolver,
17899
18042
  availableTargets,
17900
18043
  fileChanges,
17901
- workspacePath
18044
+ workspacePath,
18045
+ dockerConfig
17902
18046
  };
17903
18047
  const evalFileDir = evalCase.file_paths[0] ? path45.dirname(evalCase.file_paths[0]) : process.cwd();
17904
18048
  const dispatchContext = {
@@ -18060,13 +18204,11 @@ async function invokeProvider(provider, options) {
18060
18204
  const braintrustSpanIds = streamCallbacks?.getActiveSpanIds?.() ?? void 0;
18061
18205
  return await provider.invoke({
18062
18206
  question: promptInputs.question,
18207
+ systemPrompt: promptInputs.systemMessage,
18063
18208
  chatPrompt: promptInputs.chatPrompt,
18064
18209
  inputFiles: evalCase.file_paths,
18065
18210
  evalCaseId: evalCase.id,
18066
18211
  attempt,
18067
- metadata: {
18068
- systemPrompt: promptInputs.systemMessage ?? ""
18069
- },
18070
18212
  signal: controller.signal,
18071
18213
  cwd,
18072
18214
  workspaceFile,
@@ -18436,7 +18578,7 @@ async function discoverDefaultTarget(repoRoot) {
18436
18578
  return null;
18437
18579
  }
18438
18580
  async function loadEnvHierarchy(repoRoot, startPath) {
18439
- const { readFileSync: readFileSync4 } = await import("node:fs");
18581
+ const { readFileSync: readFileSync5 } = await import("node:fs");
18440
18582
  const chain = buildDirectoryChain(startPath, repoRoot);
18441
18583
  const envFiles = [];
18442
18584
  for (const dir of chain) {
@@ -18445,7 +18587,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
18445
18587
  }
18446
18588
  for (let i = 0; i < envFiles.length; i++) {
18447
18589
  try {
18448
- const content = readFileSync4(envFiles[i], "utf8");
18590
+ const content = readFileSync5(envFiles[i], "utf8");
18449
18591
  for (const line of content.split("\n")) {
18450
18592
  const trimmed = line.trim();
18451
18593
  if (!trimmed || trimmed.startsWith("#")) continue;
@@ -18517,12 +18659,12 @@ var CONFIG_FILE_NAMES = [
18517
18659
  ".agentv/config.js"
18518
18660
  ];
18519
18661
  async function loadTsConfig(projectRoot) {
18520
- const { existsSync: existsSync8 } = await import("node:fs");
18662
+ const { existsSync: existsSync9 } = await import("node:fs");
18521
18663
  const { pathToFileURL: pathToFileURL2 } = await import("node:url");
18522
18664
  const { join: join2 } = await import("node:path");
18523
18665
  for (const fileName of CONFIG_FILE_NAMES) {
18524
18666
  const filePath = join2(projectRoot, fileName);
18525
- if (!existsSync8(filePath)) {
18667
+ if (!existsSync9(filePath)) {
18526
18668
  continue;
18527
18669
  }
18528
18670
  try {
@@ -18619,9 +18761,9 @@ function buildPrompt(criteria, question, referenceAnswer) {
18619
18761
  }
18620
18762
 
18621
18763
  // src/evaluation/workspace/deps-scanner.ts
18622
- import { readFile as readFile14 } from "node:fs/promises";
18764
+ import { readFile as readFile15 } from "node:fs/promises";
18623
18765
  import path47 from "node:path";
18624
- import { parse as parse5 } from "yaml";
18766
+ import { parse as parse6 } from "yaml";
18625
18767
  function normalizeGitUrl(url) {
18626
18768
  let normalized = url.replace(/\.git$/, "");
18627
18769
  try {
@@ -18639,7 +18781,7 @@ async function scanRepoDeps(evalFilePaths) {
18639
18781
  try {
18640
18782
  const repos = await extractReposFromEvalFile(filePath);
18641
18783
  for (const repo of repos) {
18642
- if (repo.source.type !== "git") continue;
18784
+ if (!repo.source || repo.source.type !== "git") continue;
18643
18785
  const ref = repo.checkout?.ref;
18644
18786
  const key = `${normalizeGitUrl(repo.source.url)}\0${ref ?? ""}`;
18645
18787
  const existing = seen.get(key);
@@ -18667,8 +18809,8 @@ async function scanRepoDeps(evalFilePaths) {
18667
18809
  return { repos: [...seen.values()], errors };
18668
18810
  }
18669
18811
  async function extractReposFromEvalFile(filePath) {
18670
- const content = await readFile14(filePath, "utf8");
18671
- const parsed = interpolateEnv(parse5(content), process.env);
18812
+ const content = await readFile15(filePath, "utf8");
18813
+ const parsed = interpolateEnv(parse6(content), process.env);
18672
18814
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return [];
18673
18815
  const obj = parsed;
18674
18816
  const evalFileDir = path47.dirname(path47.resolve(filePath));
@@ -18688,8 +18830,8 @@ async function extractReposFromEvalFile(filePath) {
18688
18830
  async function extractReposFromWorkspaceRaw(raw, evalFileDir) {
18689
18831
  if (typeof raw === "string") {
18690
18832
  const workspaceFilePath = path47.resolve(evalFileDir, raw);
18691
- const content = await readFile14(workspaceFilePath, "utf8");
18692
- const parsed = interpolateEnv(parse5(content), process.env);
18833
+ const content = await readFile15(workspaceFilePath, "utf8");
18834
+ const parsed = interpolateEnv(parse6(content), process.env);
18693
18835
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return [];
18694
18836
  return extractReposFromObject(parsed);
18695
18837
  }
@@ -18716,7 +18858,7 @@ function extractReposFromObject(obj) {
18716
18858
  }
18717
18859
 
18718
18860
  // src/evaluation/cache/response-cache.ts
18719
- import { mkdir as mkdir15, readFile as readFile15, writeFile as writeFile8 } from "node:fs/promises";
18861
+ import { mkdir as mkdir15, readFile as readFile16, writeFile as writeFile8 } from "node:fs/promises";
18720
18862
  import path48 from "node:path";
18721
18863
  var DEFAULT_CACHE_PATH = ".agentv/cache";
18722
18864
  var ResponseCache = class {
@@ -18727,7 +18869,7 @@ var ResponseCache = class {
18727
18869
  async get(key) {
18728
18870
  const filePath = this.keyToPath(key);
18729
18871
  try {
18730
- const data = await readFile15(filePath, "utf8");
18872
+ const data = await readFile16(filePath, "utf8");
18731
18873
  return JSON.parse(data);
18732
18874
  } catch {
18733
18875
  return void 0;
@@ -18756,20 +18898,301 @@ function shouldSkipCacheForTemperature(targetConfig) {
18756
18898
  return false;
18757
18899
  }
18758
18900
 
18759
- // src/projects.ts
18760
- import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
18901
+ // src/evaluation/results-repo.ts
18902
+ import { execFile as execFile3 } from "node:child_process";
18903
+ import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, rmSync, writeFileSync } from "node:fs";
18904
+ import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir8, rm as rm6, stat as stat9 } from "node:fs/promises";
18905
+ import os3 from "node:os";
18761
18906
  import path49 from "node:path";
18907
+ import { promisify as promisify7 } from "node:util";
18908
+ var execFileAsync3 = promisify7(execFile3);
18909
+ function sanitizeRepoSlug(repo) {
18910
+ return repo.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
18911
+ }
18912
+ function withFriendlyGitHubAuthError(error) {
18913
+ const message = error instanceof Error ? error.message : String(error);
18914
+ const lower = message.toLowerCase();
18915
+ if (lower.includes("authentication failed") || lower.includes("could not read username") || lower.includes("permission denied") || lower.includes("not logged into any github hosts")) {
18916
+ return new Error(`${message}. Run 'gh auth login' to authenticate.`);
18917
+ }
18918
+ return new Error(message);
18919
+ }
18920
+ function normalizeResultsExportConfig(config) {
18921
+ return {
18922
+ repo: config.repo.trim(),
18923
+ path: config.path.trim().replace(/^\/+|\/+$/g, ""),
18924
+ auto_push: config.auto_push === true,
18925
+ branch_prefix: config.branch_prefix?.trim() || "eval-results"
18926
+ };
18927
+ }
18928
+ function resolveResultsRepoUrl(repo) {
18929
+ if (repo.includes("://") || repo.startsWith("git@")) {
18930
+ return repo;
18931
+ }
18932
+ return `https://github.com/${repo}.git`;
18933
+ }
18934
+ function getResultsRepoCachePaths(repo) {
18935
+ const rootDir = path49.join(getAgentvHome(), "cache", "results-repo", sanitizeRepoSlug(repo));
18936
+ return {
18937
+ rootDir,
18938
+ repoDir: path49.join(rootDir, "repo"),
18939
+ statusFile: path49.join(rootDir, "status.json")
18940
+ };
18941
+ }
18942
+ function readPersistedStatus(statusFile) {
18943
+ if (!existsSync7(statusFile)) {
18944
+ return {};
18945
+ }
18946
+ try {
18947
+ return JSON.parse(readFileSync3(statusFile, "utf8"));
18948
+ } catch {
18949
+ return {};
18950
+ }
18951
+ }
18952
+ function writePersistedStatus(statusFile, status) {
18953
+ mkdirSync2(path49.dirname(statusFile), { recursive: true });
18954
+ writeFileSync(statusFile, `${JSON.stringify(status, null, 2)}
18955
+ `, "utf8");
18956
+ }
18957
+ async function runCommand(executable, args, options) {
18958
+ try {
18959
+ const { stdout, stderr } = await execFileAsync3(executable, [...args], {
18960
+ cwd: options?.cwd,
18961
+ env: process.env
18962
+ });
18963
+ return { stdout, stderr };
18964
+ } catch (error) {
18965
+ if (options?.check === false && error && typeof error === "object") {
18966
+ const execError = error;
18967
+ return {
18968
+ stdout: execError.stdout ?? "",
18969
+ stderr: execError.stderr ?? ""
18970
+ };
18971
+ }
18972
+ throw withFriendlyGitHubAuthError(error);
18973
+ }
18974
+ }
18975
+ async function runGit(args, options) {
18976
+ return runCommand("git", args, options);
18977
+ }
18978
+ async function runGh(args, options) {
18979
+ return runCommand("gh", args, options);
18980
+ }
18981
+ async function resolveDefaultBranch(repoDir) {
18982
+ try {
18983
+ const { stdout } = await runGit(["symbolic-ref", "refs/remotes/origin/HEAD"], { cwd: repoDir });
18984
+ const ref = stdout.trim();
18985
+ const prefix = "refs/remotes/origin/";
18986
+ if (ref.startsWith(prefix)) {
18987
+ return ref.slice(prefix.length);
18988
+ }
18989
+ } catch {
18990
+ }
18991
+ for (const candidate of ["main", "master"]) {
18992
+ try {
18993
+ await runGit(["rev-parse", "--verify", `origin/${candidate}`], { cwd: repoDir });
18994
+ return candidate;
18995
+ } catch {
18996
+ }
18997
+ }
18998
+ return "main";
18999
+ }
19000
+ async function updateCacheRepo(repoDir, baseBranch) {
19001
+ await runGit(["fetch", "origin", "--prune"], { cwd: repoDir });
19002
+ await runGit(["checkout", baseBranch], { cwd: repoDir });
19003
+ await runGit(["pull", "--ff-only", "origin", baseBranch], { cwd: repoDir });
19004
+ }
19005
+ function updateStatusFile(config, patch) {
19006
+ const cachePaths = getResultsRepoCachePaths(config.repo);
19007
+ const current = readPersistedStatus(cachePaths.statusFile);
19008
+ writePersistedStatus(cachePaths.statusFile, {
19009
+ ...current,
19010
+ ...patch
19011
+ });
19012
+ }
19013
+ async function ensureResultsRepoClone(config) {
19014
+ const normalized = normalizeResultsExportConfig(config);
19015
+ const cachePaths = getResultsRepoCachePaths(normalized.repo);
19016
+ mkdirSync2(cachePaths.rootDir, { recursive: true });
19017
+ if (!existsSync7(cachePaths.repoDir)) {
19018
+ try {
19019
+ await runGit([
19020
+ "clone",
19021
+ "--filter=blob:none",
19022
+ resolveResultsRepoUrl(normalized.repo),
19023
+ cachePaths.repoDir
19024
+ ]);
19025
+ return cachePaths.repoDir;
19026
+ } catch (error) {
19027
+ updateStatusFile(normalized, { last_error: withFriendlyGitHubAuthError(error).message });
19028
+ throw withFriendlyGitHubAuthError(error);
19029
+ }
19030
+ }
19031
+ if (!existsSync7(path49.join(cachePaths.repoDir, ".git"))) {
19032
+ throw new Error(`Results repo cache is not a git repository: ${cachePaths.repoDir}`);
19033
+ }
19034
+ return cachePaths.repoDir;
19035
+ }
19036
+ function getResultsRepoStatus(config) {
19037
+ if (!config) {
19038
+ return {
19039
+ configured: false,
19040
+ available: false,
19041
+ repo: "",
19042
+ cache_dir: ""
19043
+ };
19044
+ }
19045
+ const normalized = normalizeResultsExportConfig(config);
19046
+ const cachePaths = getResultsRepoCachePaths(normalized.repo);
19047
+ const persisted = readPersistedStatus(cachePaths.statusFile);
19048
+ return {
19049
+ configured: true,
19050
+ available: existsSync7(cachePaths.repoDir),
19051
+ repo: normalized.repo,
19052
+ path: normalized.path,
19053
+ auto_push: normalized.auto_push,
19054
+ branch_prefix: normalized.branch_prefix,
19055
+ cache_dir: cachePaths.repoDir,
19056
+ last_synced_at: persisted.last_synced_at,
19057
+ last_error: persisted.last_error
19058
+ };
19059
+ }
19060
+ async function syncResultsRepo(config) {
19061
+ const normalized = normalizeResultsExportConfig(config);
19062
+ try {
19063
+ const repoDir = await ensureResultsRepoClone(normalized);
19064
+ const baseBranch = await resolveDefaultBranch(repoDir);
19065
+ await updateCacheRepo(repoDir, baseBranch);
19066
+ updateStatusFile(normalized, {
19067
+ last_synced_at: (/* @__PURE__ */ new Date()).toISOString(),
19068
+ last_error: void 0
19069
+ });
19070
+ } catch (error) {
19071
+ updateStatusFile(normalized, {
19072
+ last_error: withFriendlyGitHubAuthError(error).message
19073
+ });
19074
+ throw withFriendlyGitHubAuthError(error);
19075
+ }
19076
+ return getResultsRepoStatus(normalized);
19077
+ }
19078
+ async function checkoutResultsRepoBranch(config, branchName) {
19079
+ const normalized = normalizeResultsExportConfig(config);
19080
+ const repoDir = await ensureResultsRepoClone(normalized);
19081
+ const baseBranch = await resolveDefaultBranch(repoDir);
19082
+ await updateCacheRepo(repoDir, baseBranch);
19083
+ await runGit(["checkout", "-B", branchName, `origin/${baseBranch}`], { cwd: repoDir });
19084
+ updateStatusFile(normalized, { last_error: void 0 });
19085
+ return {
19086
+ branchName,
19087
+ baseBranch,
19088
+ repoDir
19089
+ };
19090
+ }
19091
+ async function prepareResultsRepoBranch(config, branchName) {
19092
+ const normalized = normalizeResultsExportConfig(config);
19093
+ const cloneDir = await ensureResultsRepoClone(normalized);
19094
+ const baseBranch = await resolveDefaultBranch(cloneDir);
19095
+ await updateCacheRepo(cloneDir, baseBranch);
19096
+ const worktreeRoot = await mkdtemp3(path49.join(os3.tmpdir(), "agentv-results-repo-"));
19097
+ const worktreeDir = path49.join(worktreeRoot, "repo");
19098
+ await runGit(["worktree", "add", "-B", branchName, worktreeDir, `origin/${baseBranch}`], {
19099
+ cwd: cloneDir
19100
+ });
19101
+ return {
19102
+ branchName,
19103
+ baseBranch,
19104
+ repoDir: worktreeDir,
19105
+ cleanup: async () => {
19106
+ try {
19107
+ await runGit(["worktree", "remove", "--force", worktreeDir], { cwd: cloneDir });
19108
+ } finally {
19109
+ await rm6(worktreeRoot, { recursive: true, force: true }).catch(() => void 0);
19110
+ }
19111
+ }
19112
+ };
19113
+ }
19114
+ async function stageResultsArtifacts(params) {
19115
+ rmSync(params.destinationDir, { recursive: true, force: true });
19116
+ mkdirSync2(path49.dirname(params.destinationDir), { recursive: true });
19117
+ await cp3(params.sourceDir, params.destinationDir, { recursive: true });
19118
+ }
19119
+ function resolveResultsRepoRunsDir(config) {
19120
+ const normalized = normalizeResultsExportConfig(config);
19121
+ return path49.join(
19122
+ getResultsRepoCachePaths(normalized.repo).repoDir,
19123
+ ...normalized.path.split("/")
19124
+ );
19125
+ }
19126
+ async function directorySizeBytes(targetPath) {
19127
+ const entry = await stat9(targetPath);
19128
+ if (entry.isFile()) {
19129
+ return entry.size;
19130
+ }
19131
+ let total = 0;
19132
+ for (const child of await readdir8(targetPath, { withFileTypes: true })) {
19133
+ total += await directorySizeBytes(path49.join(targetPath, child.name));
19134
+ }
19135
+ return total;
19136
+ }
19137
+ async function commitAndPushResultsBranch(params) {
19138
+ await runGit(["add", "--all"], { cwd: params.repoDir });
19139
+ const { stdout: diffStdout } = await runGit(["status", "--porcelain"], {
19140
+ cwd: params.repoDir,
19141
+ check: false
19142
+ });
19143
+ if (diffStdout.trim().length === 0) {
19144
+ return false;
19145
+ }
19146
+ await runGit(["commit", "-m", params.commitMessage], { cwd: params.repoDir });
19147
+ await runGit(["push", "-u", "origin", params.branchName], { cwd: params.repoDir });
19148
+ return true;
19149
+ }
19150
+ async function pushResultsRepoBranch(config, branchName, cwd) {
19151
+ const normalized = normalizeResultsExportConfig(config);
19152
+ await runGit(["push", "-u", "origin", branchName], {
19153
+ cwd: cwd ?? getResultsRepoCachePaths(normalized.repo).repoDir
19154
+ });
19155
+ updateStatusFile(normalized, {
19156
+ last_synced_at: (/* @__PURE__ */ new Date()).toISOString(),
19157
+ last_error: void 0
19158
+ });
19159
+ }
19160
+ async function createDraftResultsPr(params) {
19161
+ const { stdout } = await runGh(
19162
+ [
19163
+ "pr",
19164
+ "create",
19165
+ "--draft",
19166
+ "--repo",
19167
+ params.repo,
19168
+ "--base",
19169
+ params.baseBranch,
19170
+ "--head",
19171
+ params.branchName,
19172
+ "--title",
19173
+ params.title,
19174
+ "--body",
19175
+ params.body
19176
+ ],
19177
+ { cwd: params.repoDir }
19178
+ );
19179
+ return stdout.trim();
19180
+ }
19181
+
19182
+ // src/projects.ts
19183
+ import { existsSync as existsSync8, mkdirSync as mkdirSync3, readFileSync as readFileSync4, readdirSync as readdirSync3, statSync as statSync2, writeFileSync as writeFileSync2 } from "node:fs";
19184
+ import path50 from "node:path";
18762
19185
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
18763
19186
  function getProjectsRegistryPath() {
18764
- return path49.join(getAgentvHome(), "projects.yaml");
19187
+ return path50.join(getAgentvHome(), "projects.yaml");
18765
19188
  }
18766
19189
  function loadProjectRegistry() {
18767
19190
  const registryPath = getProjectsRegistryPath();
18768
- if (!existsSync7(registryPath)) {
19191
+ if (!existsSync8(registryPath)) {
18769
19192
  return { projects: [] };
18770
19193
  }
18771
19194
  try {
18772
- const raw = readFileSync3(registryPath, "utf-8");
19195
+ const raw = readFileSync4(registryPath, "utf-8");
18773
19196
  const parsed = parseYaml3(raw);
18774
19197
  if (!parsed || !Array.isArray(parsed.projects)) {
18775
19198
  return { projects: [] };
@@ -18781,14 +19204,14 @@ function loadProjectRegistry() {
18781
19204
  }
18782
19205
  function saveProjectRegistry(registry) {
18783
19206
  const registryPath = getProjectsRegistryPath();
18784
- const dir = path49.dirname(registryPath);
18785
- if (!existsSync7(dir)) {
18786
- mkdirSync2(dir, { recursive: true });
19207
+ const dir = path50.dirname(registryPath);
19208
+ if (!existsSync8(dir)) {
19209
+ mkdirSync3(dir, { recursive: true });
18787
19210
  }
18788
- writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
19211
+ writeFileSync2(registryPath, stringifyYaml(registry), "utf-8");
18789
19212
  }
18790
19213
  function deriveProjectId(dirPath, existingIds) {
18791
- const base = path49.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
19214
+ const base = path50.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
18792
19215
  let candidate = base || "project";
18793
19216
  let suffix = 2;
18794
19217
  while (existingIds.includes(candidate)) {
@@ -18798,11 +19221,11 @@ function deriveProjectId(dirPath, existingIds) {
18798
19221
  return candidate;
18799
19222
  }
18800
19223
  function addProject(projectPath) {
18801
- const absPath = path49.resolve(projectPath);
18802
- if (!existsSync7(absPath)) {
19224
+ const absPath = path50.resolve(projectPath);
19225
+ if (!existsSync8(absPath)) {
18803
19226
  throw new Error(`Directory not found: ${absPath}`);
18804
19227
  }
18805
- if (!existsSync7(path49.join(absPath, ".agentv"))) {
19228
+ if (!existsSync8(path50.join(absPath, ".agentv"))) {
18806
19229
  throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
18807
19230
  }
18808
19231
  const registry = loadProjectRegistry();
@@ -18816,7 +19239,7 @@ function addProject(projectPath) {
18816
19239
  absPath,
18817
19240
  registry.projects.map((p) => p.id)
18818
19241
  ),
18819
- name: path49.basename(absPath),
19242
+ name: path50.basename(absPath),
18820
19243
  path: absPath,
18821
19244
  addedAt: now,
18822
19245
  lastOpenedAt: now
@@ -18845,14 +19268,14 @@ function touchProject(projectId) {
18845
19268
  }
18846
19269
  }
18847
19270
  function discoverProjects(rootDir, maxDepth = 2) {
18848
- const absRoot = path49.resolve(rootDir);
18849
- if (!existsSync7(absRoot) || !statSync2(absRoot).isDirectory()) {
19271
+ const absRoot = path50.resolve(rootDir);
19272
+ if (!existsSync8(absRoot) || !statSync2(absRoot).isDirectory()) {
18850
19273
  return [];
18851
19274
  }
18852
19275
  const results = [];
18853
19276
  function scan(dir, depth) {
18854
19277
  if (depth > maxDepth) return;
18855
- if (existsSync7(path49.join(dir, ".agentv"))) {
19278
+ if (existsSync8(path50.join(dir, ".agentv"))) {
18856
19279
  results.push(dir);
18857
19280
  return;
18858
19281
  }
@@ -18862,7 +19285,7 @@ function discoverProjects(rootDir, maxDepth = 2) {
18862
19285
  for (const entry of entries) {
18863
19286
  if (!entry.isDirectory()) continue;
18864
19287
  if (entry.name.startsWith(".") || entry.name === "node_modules") continue;
18865
- scan(path49.join(dir, entry.name), depth + 1);
19288
+ scan(path50.join(dir, entry.name), depth + 1);
18866
19289
  }
18867
19290
  } catch {
18868
19291
  }
@@ -19773,33 +20196,33 @@ function extractResponseItemContent(content) {
19773
20196
  }
19774
20197
 
19775
20198
  // src/import/codex-session-discovery.ts
19776
- import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
20199
+ import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
19777
20200
  import { homedir as homedir3 } from "node:os";
19778
- import path50 from "node:path";
19779
- var DEFAULT_SESSIONS_DIR = () => path50.join(homedir3(), ".codex", "sessions");
20201
+ import path51 from "node:path";
20202
+ var DEFAULT_SESSIONS_DIR = () => path51.join(homedir3(), ".codex", "sessions");
19780
20203
  async function discoverCodexSessions(opts) {
19781
20204
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
19782
20205
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
19783
20206
  const sessions = [];
19784
20207
  let yearDirs;
19785
20208
  try {
19786
- yearDirs = await readdir8(sessionsDir);
20209
+ yearDirs = await readdir9(sessionsDir);
19787
20210
  } catch {
19788
20211
  return [];
19789
20212
  }
19790
20213
  for (const year of yearDirs) {
19791
- const yearPath = path50.join(sessionsDir, year);
20214
+ const yearPath = path51.join(sessionsDir, year);
19792
20215
  let monthDirs;
19793
20216
  try {
19794
- monthDirs = await readdir8(yearPath);
20217
+ monthDirs = await readdir9(yearPath);
19795
20218
  } catch {
19796
20219
  continue;
19797
20220
  }
19798
20221
  for (const month of monthDirs) {
19799
- const monthPath = path50.join(yearPath, month);
20222
+ const monthPath = path51.join(yearPath, month);
19800
20223
  let dayDirs;
19801
20224
  try {
19802
- dayDirs = await readdir8(monthPath);
20225
+ dayDirs = await readdir9(monthPath);
19803
20226
  } catch {
19804
20227
  continue;
19805
20228
  }
@@ -19808,22 +20231,22 @@ async function discoverCodexSessions(opts) {
19808
20231
  const dirDate = `${year}-${month}-${day}`;
19809
20232
  if (dirDate !== opts.date) continue;
19810
20233
  }
19811
- const dayPath = path50.join(monthPath, day);
20234
+ const dayPath = path51.join(monthPath, day);
19812
20235
  let files;
19813
20236
  try {
19814
- files = await readdir8(dayPath);
20237
+ files = await readdir9(dayPath);
19815
20238
  } catch {
19816
20239
  continue;
19817
20240
  }
19818
20241
  for (const file of files) {
19819
20242
  if (!file.startsWith("rollout-") || !file.endsWith(".jsonl")) continue;
19820
- const filePath = path50.join(dayPath, file);
20243
+ const filePath = path51.join(dayPath, file);
19821
20244
  const nameWithoutExt = file.replace(/\.jsonl$/, "");
19822
20245
  const parts = nameWithoutExt.split("-");
19823
20246
  const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
19824
20247
  let updatedAt;
19825
20248
  try {
19826
- const fileStat = await stat9(filePath);
20249
+ const fileStat = await stat10(filePath);
19827
20250
  updatedAt = fileStat.mtime;
19828
20251
  } catch {
19829
20252
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -19838,10 +20261,10 @@ async function discoverCodexSessions(opts) {
19838
20261
  }
19839
20262
 
19840
20263
  // src/import/session-discovery.ts
19841
- import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
20264
+ import { readdir as readdir10, stat as stat11 } from "node:fs/promises";
19842
20265
  import { homedir as homedir4 } from "node:os";
19843
- import path51 from "node:path";
19844
- var DEFAULT_PROJECTS_DIR = () => path51.join(homedir4(), ".claude", "projects");
20266
+ import path52 from "node:path";
20267
+ var DEFAULT_PROJECTS_DIR = () => path52.join(homedir4(), ".claude", "projects");
19845
20268
  function encodeProjectPath(projectPath) {
19846
20269
  return projectPath.replace(/\//g, "-");
19847
20270
  }
@@ -19850,7 +20273,7 @@ async function discoverClaudeSessions(opts) {
19850
20273
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
19851
20274
  let projectDirs;
19852
20275
  try {
19853
- projectDirs = await readdir9(projectsDir);
20276
+ projectDirs = await readdir10(projectsDir);
19854
20277
  } catch {
19855
20278
  return [];
19856
20279
  }
@@ -19860,10 +20283,10 @@ async function discoverClaudeSessions(opts) {
19860
20283
  }
19861
20284
  const sessions = [];
19862
20285
  for (const projectDir of projectDirs) {
19863
- const dirPath = path51.join(projectsDir, projectDir);
20286
+ const dirPath = path52.join(projectsDir, projectDir);
19864
20287
  let entries;
19865
20288
  try {
19866
- entries = await readdir9(dirPath);
20289
+ entries = await readdir10(dirPath);
19867
20290
  } catch {
19868
20291
  continue;
19869
20292
  }
@@ -19871,10 +20294,10 @@ async function discoverClaudeSessions(opts) {
19871
20294
  if (!entry.endsWith(".jsonl")) continue;
19872
20295
  const sessionId = entry.replace(/\.jsonl$/, "");
19873
20296
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
19874
- const filePath = path51.join(dirPath, entry);
20297
+ const filePath = path52.join(dirPath, entry);
19875
20298
  let updatedAt;
19876
20299
  try {
19877
- const fileStat = await stat10(filePath);
20300
+ const fileStat = await stat11(filePath);
19878
20301
  updatedAt = fileStat.mtime;
19879
20302
  } catch {
19880
20303
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -19892,7 +20315,7 @@ async function discoverClaudeSessions(opts) {
19892
20315
  }
19893
20316
 
19894
20317
  // src/import/types.ts
19895
- import { readFile as readFile16 } from "node:fs/promises";
20318
+ import { readFile as readFile17 } from "node:fs/promises";
19896
20319
  function toTranscriptJsonLine(entry) {
19897
20320
  const firstUserMessage = entry.messages.find((m) => m.role === "user");
19898
20321
  const input = typeof firstUserMessage?.content === "string" ? firstUserMessage.content : "";
@@ -19918,11 +20341,11 @@ function toTranscriptJsonLine(entry) {
19918
20341
  };
19919
20342
  }
19920
20343
  async function readTranscriptJsonl(filePath) {
19921
- const text = await readFile16(filePath, "utf8");
20344
+ const text = await readFile17(filePath, "utf8");
19922
20345
  return text.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
19923
20346
  }
19924
20347
  async function readTranscriptFile(filePath) {
19925
- return readFile16(filePath, "utf8");
20348
+ return readFile17(filePath, "utf8");
19926
20349
  }
19927
20350
 
19928
20351
  // src/import/transcript-provider.ts
@@ -19987,6 +20410,7 @@ export {
19987
20410
  DEFAULT_EXPLORATION_TOOLS,
19988
20411
  DEFAULT_THRESHOLD,
19989
20412
  DeterministicAssertionEvaluator,
20413
+ DockerWorkspaceProvider,
19990
20414
  EvaluatorRegistry,
19991
20415
  ExecutionMetricsEvaluator,
19992
20416
  FieldAccuracyEvaluator,
@@ -20022,9 +20446,11 @@ export {
20022
20446
  buildSearchRoots,
20023
20447
  calculateRubricScore,
20024
20448
  captureFileChanges,
20449
+ checkoutResultsRepoBranch,
20025
20450
  clampScore,
20026
20451
  cleanupEvalWorkspaces,
20027
20452
  cleanupWorkspace,
20453
+ commitAndPushResultsBranch,
20028
20454
  computeTraceSummary,
20029
20455
  computeWorkspaceFingerprint,
20030
20456
  consumeClaudeLogEntries,
@@ -20035,6 +20461,7 @@ export {
20035
20461
  createAgentKernel,
20036
20462
  createBuiltinProviderRegistry,
20037
20463
  createBuiltinRegistry,
20464
+ createDraftResultsPr,
20038
20465
  createProvider,
20039
20466
  createTempWorkspace,
20040
20467
  deepEqual,
@@ -20042,6 +20469,7 @@ export {
20042
20469
  deriveCategory,
20043
20470
  deriveProjectId,
20044
20471
  detectFormat,
20472
+ directorySizeBytes,
20045
20473
  discoverAssertions,
20046
20474
  discoverClaudeSessions,
20047
20475
  discoverCodexSessions,
@@ -20050,6 +20478,7 @@ export {
20050
20478
  discoverGraders as discoverJudges,
20051
20479
  discoverProjects,
20052
20480
  discoverProviders,
20481
+ ensureResultsRepoClone,
20053
20482
  ensureVSCodeSubagents,
20054
20483
  evaluate,
20055
20484
  executeScript,
@@ -20074,6 +20503,8 @@ export {
20074
20503
  getOutputFilenames,
20075
20504
  getProject,
20076
20505
  getProjectsRegistryPath,
20506
+ getResultsRepoCachePaths,
20507
+ getResultsRepoStatus,
20077
20508
  getSubagentsRoot,
20078
20509
  getTextContent,
20079
20510
  getTraceStateRoot,
@@ -20103,12 +20534,15 @@ export {
20103
20534
  mergeExecutionMetrics,
20104
20535
  negateScore,
20105
20536
  normalizeLineEndings,
20537
+ normalizeResultsExportConfig,
20106
20538
  parseAgentSkillsEvals,
20107
20539
  parseClaudeSession,
20108
20540
  parseCodexSession,
20109
20541
  parseCopilotEvents,
20110
20542
  parseJsonFromText,
20111
20543
  parseJsonSafe,
20544
+ prepareResultsRepoBranch,
20545
+ pushResultsRepoBranch,
20112
20546
  readJsonFile,
20113
20547
  readTargetDefinitions,
20114
20548
  readTestSuiteMetadata,
@@ -20119,6 +20553,8 @@ export {
20119
20553
  resolveAndCreateProvider,
20120
20554
  resolveDelegatedTargetDefinition,
20121
20555
  resolveFileReference,
20556
+ resolveResultsRepoRunsDir,
20557
+ resolveResultsRepoUrl,
20122
20558
  resolveTargetDefinition,
20123
20559
  resolveWorkspaceTemplate,
20124
20560
  rubricEvaluationSchema,
@@ -20140,12 +20576,14 @@ export {
20140
20576
  scoreToVerdict,
20141
20577
  shouldEnableCache,
20142
20578
  shouldSkipCacheForTemperature,
20579
+ stageResultsArtifacts,
20143
20580
  subscribeToClaudeLogEntries,
20144
20581
  subscribeToCodexLogEntries,
20145
20582
  subscribeToCopilotCliLogEntries,
20146
20583
  subscribeToCopilotSdkLogEntries,
20147
20584
  subscribeToPiLogEntries,
20148
20585
  substituteVariables,
20586
+ syncResultsRepo,
20149
20587
  toCamelCaseDeep,
20150
20588
  toSnakeCaseDeep,
20151
20589
  toTranscriptJsonLine,