@agentv/core 4.10.0 → 4.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -37,6 +37,161 @@ var init_cjs_shims = __esm({
37
37
  }
38
38
  });
39
39
 
40
+ // src/runtime/exec.ts
41
+ var exec_exports = {};
42
+ __export(exec_exports, {
43
+ execFileWithStdin: () => execFileWithStdin,
44
+ execShellWithStdin: () => execShellWithStdin
45
+ });
46
/**
 * Quote a filesystem path so a shell treats it as a single word.
 *
 * On Windows the path is wrapped in double quotes and embedded double quotes
 * are doubled; everywhere else it is wrapped in single quotes and each
 * embedded single quote is replaced by the `'"'"'` sequence.
 *
 * @param {string} value - Path to quote.
 * @returns {string} The quoted path.
 */
function shellEscapePath(value) {
  const onWindows = process.platform === "win32";
  const quote = onWindows ? '"' : "'";
  const escapedQuote = onWindows ? '""' : `'"'"'`;
  const escapedBody = value.split(quote).join(escapedQuote);
  return `${quote}${escapedBody}${quote}`;
}
52
/**
 * Run an executable (no shell) with `stdinPayload` piped to its stdin.
 *
 * Dispatches to the Bun implementation when a global `Bun` object exists and
 * to the Node.js implementation otherwise.
 *
 * @param {string[]} argv - Executable followed by its arguments; must not be empty.
 * @param {string} stdinPayload - Text written to the child's stdin.
 * @param {{cwd?: string, env?: object, timeoutMs?: number}} [options] - Spawn options.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 * @throws {Error} When `argv` is empty.
 */
async function execFileWithStdin(argv, stdinPayload, options = {}) {
  const hasExecutable = argv.length !== 0;
  if (!hasExecutable) {
    throw new Error("Executable argv must include at least one entry");
  }
  const runningUnderBun = typeof Bun !== "undefined";
  return runningUnderBun
    ? execFileWithStdinBun(argv, stdinPayload, options)
    : execFileWithStdinNode(argv, stdinPayload, options);
}
61
/**
 * Bun implementation of `execFileWithStdin`, built on `Bun.spawn`.
 *
 * stdout and stderr are read to completion, CRLF is normalised to LF, and the
 * value of `proc.exited` is returned as `exitCode`. When `options.timeoutMs`
 * is set, the process is killed with SIGKILL once the timer fires and the
 * call rejects after the process has exited.
 *
 * @param {string[]} argv - Executable followed by its arguments.
 * @param {string} stdinPayload - Text encoded and handed to the child's stdin.
 * @param {{cwd?: string, env?: object, timeoutMs?: number}} options - Spawn options.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 * @throws {Error} `Process timed out after <n>ms` when the timer fired.
 */
async function execFileWithStdinBun(argv, stdinPayload, options) {
  const argvCopy = [...argv];
  const payloadBytes = new TextEncoder().encode(stdinPayload);
  const child = Bun.spawn(argvCopy, {
    cwd: options.cwd,
    stdin: payloadBytes,
    stdout: "pipe",
    stderr: "pipe",
    // Extra variables are layered on top of the parent's environment.
    env: options.env ? { ...process.env, ...options.env } : process.env
  });

  const readAll = (stream) => (stream ? new Response(stream).text() : Promise.resolve(""));
  const toLf = (text) => text.replace(/\r\n/g, "\n");

  let killedByTimer = false;
  let timer;
  if (options.timeoutMs !== void 0) {
    timer = setTimeout(() => {
      killedByTimer = true;
      child.kill("SIGKILL");
    }, options.timeoutMs);
  }

  try {
    const pending = [readAll(child.stdout), readAll(child.stderr), child.exited];
    const [rawStdout, rawStderr, exitCode] = await Promise.all(pending);
    if (killedByTimer) {
      throw new Error(`Process timed out after ${options.timeoutMs}ms`);
    }
    return { stdout: toLf(rawStdout), stderr: toLf(rawStderr), exitCode };
  } finally {
    if (timer !== void 0) {
      clearTimeout(timer);
    }
  }
}
99
/**
 * Node.js implementation of `execFileWithStdin`, built on `child_process.spawn`.
 *
 * stdout and stderr are buffered, decoded as UTF-8 and CRLF-normalised. The
 * promise settles on the child's `close` event.
 *
 * Behaviour notes:
 * - A child terminated by a signal has no numeric exit code; it is reported as
 *   `128 + <signal number>` (shell convention) instead of a misleading `0`.
 * - Writing to the stdin of a child that has already exited (or closed its
 *   stdin) raises EPIPE on the stream. That is expected for commands that do
 *   not read their input, so it is ignored and the exit status reported via
 *   `close` remains the outcome. Any other stdin failure rejects the call.
 *
 * @param {string[]} argv - Executable followed by its arguments.
 * @param {string} stdinPayload - Text written to the child's stdin.
 * @param {{cwd?: string, env?: object, timeoutMs?: number}} options - Spawn options.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 * @throws {Error} Spawn errors, unexpected stdin errors, or
 *   `Process timed out after <n>ms` when the timeout killed the child.
 */
async function execFileWithStdinNode(argv, stdinPayload, options) {
  const { spawn: spawn5 } = await import("child_process");
  const { constants: osConstants } = await import("os");
  return new Promise((resolve, reject) => {
    const [cmd, ...args] = argv;
    const child = spawn5(cmd, args, {
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"],
      // Merge additional env vars with process.env
      env: options.env ? { ...process.env, ...options.env } : process.env
    });
    const stdoutChunks = [];
    const stderrChunks = [];
    child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
    child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
    let timedOut = false;
    const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
      timedOut = true;
      child.kill("SIGKILL");
    }, options.timeoutMs) : void 0;
    child.on("error", (error) => {
      if (timeout !== void 0) clearTimeout(timeout);
      reject(error);
    });
    child.on("close", (code, signal) => {
      if (timeout !== void 0) clearTimeout(timeout);
      if (timedOut) {
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
        return;
      }
      const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
      const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
      // `code` is null when the child was terminated by a signal.
      const signalExitCode = signal ? 128 + (osConstants.signals[signal] ?? 0) : 0;
      resolve({
        stdout,
        stderr,
        exitCode: code ?? signalExitCode
      });
    });
    if (child.stdin) {
      // Without a listener, a stream "error" event is thrown as an uncaught exception.
      child.stdin.on("error", (error) => {
        const benignCodes = ["EPIPE", "ECONNRESET", "ERR_STREAM_DESTROYED"];
        if (benignCodes.includes(error?.code)) {
          return;
        }
        if (timeout !== void 0) clearTimeout(timeout);
        reject(error);
        child.kill("SIGKILL");
      });
      child.stdin.write(stdinPayload);
      child.stdin.end();
    }
  });
}
142
/**
 * Run a shell command line with `stdinPayload` redirected into it.
 *
 * stdin, stdout and stderr are exchanged through files in a private scratch
 * directory under the OS temp dir: the command is wrapped as
 * `(<command>) < stdin.txt > stdout.txt 2> stderr.txt` and spawned with
 * `shell: true`. The captured files are read back with CRLF normalised to LF.
 *
 * The scratch directory is created inside the `try` so that the `finally`
 * removes it on every path, including a failure while writing the stdin file.
 *
 * @param {string} command - Shell command line; it is NOT escaped, so the
 *   caller is responsible for its contents.
 * @param {string} stdinPayload - Text made available on the command's stdin.
 * @param {{cwd?: string, env?: object, timeoutMs?: number}} [options] - Spawn
 *   options. A falsy `timeoutMs` (including 0) disables the timeout.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 * @throws {Error} Spawn or file-system errors, or
 *   `Process timed out after <n>ms` when the timeout fired.
 */
async function execShellWithStdin(command, stdinPayload, options = {}) {
  const { mkdir: mkdir17, readFile: readFile20, rm: rm7, writeFile: writeFile9 } = await import("fs/promises");
  const { tmpdir: tmpdir3 } = await import("os");
  const path56 = await import("path");
  const { randomUUID: randomUUID10 } = await import("crypto");
  const { spawn: spawn5 } = await import("child_process");
  const dir = path56.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
  const stdinPath = path56.join(dir, "stdin.txt");
  const stdoutPath = path56.join(dir, "stdout.txt");
  const stderrPath = path56.join(dir, "stderr.txt");
  // The redirection syntax is the same for cmd.exe and POSIX shells; only the
  // path quoting differs, and shellEscapePath handles that.
  const wrappedCommand = `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
  try {
    await mkdir17(dir, { recursive: true });
    await writeFile9(stdinPath, stdinPayload, "utf8");
    const exitCode = await new Promise((resolve, reject) => {
      const child = spawn5(wrappedCommand, {
        shell: true,
        cwd: options.cwd,
        stdio: ["ignore", "ignore", "ignore"],
        // Merge additional env vars with process.env
        env: options.env ? { ...process.env, ...options.env } : process.env
      });
      const timeout = options.timeoutMs ? setTimeout(() => {
        child.kill();
        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
      }, options.timeoutMs) : void 0;
      child.on("error", (error) => {
        if (timeout !== void 0) {
          clearTimeout(timeout);
        }
        reject(error);
      });
      child.on("exit", (code) => {
        if (timeout !== void 0) {
          clearTimeout(timeout);
        }
        resolve(code ?? 0);
      });
    });
    const stdout = (await readFile20(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
    const stderr = (await readFile20(stderrPath, "utf8")).replace(/\r\n/g, "\n");
    return { stdout, stderr, exitCode };
  } finally {
    // `force: true` makes this a no-op when the directory was never created.
    await rm7(dir, { recursive: true, force: true });
  }
}
188
// Bundler-generated initializer for src/runtime/exec.ts: running it only
// triggers the CJS shim initializer; the module's functions are hoisted
// declarations and need no setup of their own.
var init_exec = __esm({
  "src/runtime/exec.ts": function() {
    "use strict";
    init_cjs_shims();
  }
});
194
+
40
195
  // src/evaluation/providers/agentv-provider.ts
41
196
  var agentv_provider_exports = {};
42
197
  __export(agentv_provider_exports, {
@@ -109,6 +264,194 @@ var init_agentv_provider = __esm({
109
264
  }
110
265
  });
111
266
 
267
+ // src/evaluation/workspace/docker-workspace.ts
268
+ var docker_workspace_exports = {};
269
+ __export(docker_workspace_exports, {
270
+ DefaultCommandExecutor: () => DefaultCommandExecutor,
271
+ DockerWorkspaceProvider: () => DockerWorkspaceProvider
272
+ });
273
/**
 * Build a git argv, optionally scoped to a repository directory.
 *
 * @param {{path?: string}|undefined|null} target - Checkout target; when it has
 *   a truthy `path`, `-C <path>` is inserted so git runs in that directory.
 * @param {string[]} args - git sub-command and its arguments.
 * @returns {string[]} `["git", ...]` argv.
 */
function buildGitCommand(target, args) {
  const repoDir = target?.path;
  const directoryFlag = repoDir ? ["-C", repoDir] : [];
  return ["git", ...directoryFlag, ...args];
}
279
var DefaultCommandExecutor, DEFAULT_TIMEOUT_S, DockerWorkspaceProvider;
var init_docker_workspace = __esm({
  "src/evaluation/workspace/docker-workspace.ts"() {
    "use strict";
    init_cjs_shims();
    /**
     * Default executor: runs an argv on the host through `execFileWithStdin`.
     * Only `stdin` and `timeoutMs` are forwarded from `options`.
     */
    DefaultCommandExecutor = class {
      /**
       * @param {string[]} argv - Executable followed by its arguments.
       * @param {{stdin?: string, timeoutMs?: number}} [options]
       * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
       */
      async exec(argv, options = {}) {
        // The exec module is initialised lazily on first use.
        const { execFileWithStdin: execFileWithStdin2 } = await Promise.resolve().then(() => (init_exec(), exec_exports));
        return execFileWithStdin2(argv, options.stdin ?? "", {
          timeoutMs: options.timeoutMs
        });
      }
    };
    // Default per-command timeout in seconds, used when `config.timeout` is unset.
    DEFAULT_TIMEOUT_S = 1800;
    /**
     * Thin wrapper around the Docker CLI for grading inside a container.
     * Every operation shells out to `docker ...` through the injected executor.
     */
    DockerWorkspaceProvider = class {
      config;
      executor;
      timeoutMs;
      /**
       * @param {{image: string, timeout?: number, memory?: string, cpus?: number}} config -
       *   `timeout` is in seconds and is converted to milliseconds here.
       * @param {{exec: Function}} [executor] - Command executor; defaults to
       *   a new `DefaultCommandExecutor`.
       */
      constructor(config, executor) {
        this.config = config;
        this.executor = executor ?? new DefaultCommandExecutor();
        this.timeoutMs = (config.timeout ?? DEFAULT_TIMEOUT_S) * 1e3;
      }
      /** Check whether the Docker CLI is available on the host. Never throws. */
      async isDockerAvailable() {
        try {
          const result = await this.executor.exec(
            ["docker", "version", "--format", "{{.Server.Version}}"],
            {
              timeoutMs: 1e4
            }
          );
          return result.exitCode === 0;
        } catch {
          // Any executor failure (spawn error, timeout) means "not available".
          return false;
        }
      }
      /**
       * Pull the configured Docker image. No-op if already cached locally.
       * @throws {Error} When `docker pull` exits non-zero.
       */
      async pullImage() {
        const inspectResult = await this.executor.exec(
          ["docker", "image", "inspect", this.config.image],
          {
            timeoutMs: 1e4
          }
        );
        if (inspectResult.exitCode === 0) {
          return;
        }
        const result = await this.executor.exec(["docker", "pull", this.config.image], {
          timeoutMs: this.timeoutMs
        });
        if (result.exitCode !== 0) {
          throw new Error(`docker pull failed (exit ${result.exitCode}): ${result.stderr.trim()}`);
        }
      }
      /**
       * Create a stopped container from the configured image with resource limits.
       * The container's command is `sleep infinity`.
       * @returns {Promise<string>} Trimmed stdout of `docker create` (the container ID).
       * @throws {Error} When `docker create` exits non-zero.
       */
      async createContainer() {
        const argv = ["docker", "create"];
        if (this.config.memory) {
          argv.push(`--memory=${this.config.memory}`);
        }
        if (this.config.cpus !== void 0) {
          argv.push(`--cpus=${this.config.cpus}`);
        }
        argv.push(this.config.image, "sleep", "infinity");
        const result = await this.executor.exec(argv, { timeoutMs: 3e4 });
        if (result.exitCode !== 0) {
          throw new Error(`docker create failed (exit ${result.exitCode}): ${result.stderr.trim()}`);
        }
        return result.stdout.trim();
      }
      /**
       * Start a previously created container.
       * @throws {Error} When `docker start` exits non-zero.
       */
      async startContainer(containerId) {
        const result = await this.executor.exec(["docker", "start", containerId], {
          timeoutMs: 3e4
        });
        if (result.exitCode !== 0) {
          throw new Error(`docker start failed (exit ${result.exitCode}): ${result.stderr.trim()}`);
        }
      }
      /**
       * Reset the container checkout to the specified target refs, if any.
       * This is used for SWE-bench images where the repo state must match the
       * dataset's base snapshot before grading begins.
       *
       * After each `git reset --hard <ref>` the resulting HEAD is verified. A
       * ref given as a full commit ID is compared directly. Any other ref
       * (branch, tag, abbreviated SHA) is resolved inside the container with
       * `git rev-parse --verify --quiet <ref>^{commit}` and the resolved commit
       * is compared with HEAD, so a successful reset to a symbolic ref is not
       * reported as a verification failure.
       *
       * @param {string} containerId
       * @param {Array<{path?: string, ref: string}>} [repoCheckouts]
       * @throws {Error} When the reset or the verification fails.
       */
      async resetContainerCheckout(containerId, repoCheckouts) {
        if (!repoCheckouts || repoCheckouts.length === 0) {
          return;
        }
        for (const target of repoCheckouts) {
          const resetResult = await this.execInContainer({
            containerId,
            command: buildGitCommand(target, ["reset", "--hard", target.ref])
          });
          if (resetResult.exitCode !== 0) {
            throw new Error(
              `docker git reset failed (exit ${resetResult.exitCode}): ${resetResult.stderr.trim()}`
            );
          }
          const verifyResult = await this.execInContainer({
            containerId,
            command: buildGitCommand(target, ["rev-parse", "HEAD"]),
            timeoutMs: 3e4
          });
          if (verifyResult.exitCode !== 0) {
            throw new Error(
              `docker checkout verification failed (exit ${verifyResult.exitCode}): ${verifyResult.stderr.trim()}`
            );
          }
          const head = verifyResult.stdout.trim();
          if (head === target.ref) {
            continue;
          }
          // HEAD is always a full commit ID; resolve the ref before declaring a mismatch.
          const resolveResult = await this.execInContainer({
            containerId,
            command: buildGitCommand(target, [
              "rev-parse",
              "--verify",
              "--quiet",
              `${target.ref}^{commit}`
            ]),
            timeoutMs: 3e4
          });
          const resolvedRef = resolveResult.exitCode === 0 ? resolveResult.stdout.trim() : "";
          if (!head || resolvedRef !== head) {
            throw new Error(
              `docker checkout verification failed: expected ${target.ref} but found ${head || "<empty>"}`
            );
          }
        }
      }
      /**
       * Copy a local file or directory into a running container.
       * @throws {Error} When `docker cp` exits non-zero.
       */
      async copyToContainer(containerId, localPath, containerPath) {
        const result = await this.executor.exec(
          ["docker", "cp", localPath, `${containerId}:${containerPath}`],
          { timeoutMs: 6e4 }
        );
        if (result.exitCode !== 0) {
          throw new Error(`docker cp failed (exit ${result.exitCode}): ${result.stderr.trim()}`);
        }
      }
      /**
       * Execute a command inside a running container.
       * If stdin is provided, it is piped via `docker exec -i`.
       *
       * @param {{containerId: string, command: string[], timeoutMs?: number, stdin?: string}} options -
       *   `timeoutMs` falls back to the provider-wide timeout.
       * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
       *   The executor's result; a non-zero exit code is returned, not thrown.
       */
      async execInContainer(options) {
        const { containerId, command, timeoutMs, stdin } = options;
        const argv = ["docker", "exec"];
        if (stdin !== void 0) {
          argv.push("-i");
        }
        argv.push(containerId, ...command);
        return this.executor.exec(argv, {
          timeoutMs: timeoutMs ?? this.timeoutMs,
          stdin
        });
      }
      /** Force-remove a container (always succeeds, even if container doesn't exist). */
      async removeContainer(containerId) {
        try {
          await this.executor.exec(["docker", "rm", "-f", containerId], {
            timeoutMs: 3e4
          });
        } catch {
          // Best-effort cleanup: a failed removal must not mask the grading result.
        }
      }
      /**
       * Full lifecycle: create → start → exec → cleanup. Convenience for single-command grading.
       *
       * @param {{command: string[], stdin?: string, repoCheckouts?: Array<{path?: string, ref: string}>,
       *   copyFiles?: Array<{localPath: string, containerPath: string}>}} options
       * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>} Result of the command.
       */
      async runGraderInContainer(options) {
        const containerId = await this.createContainer();
        try {
          await this.startContainer(containerId);
          await this.resetContainerCheckout(containerId, options.repoCheckouts);
          if (options.copyFiles) {
            for (const file of options.copyFiles) {
              await this.copyToContainer(containerId, file.localPath, file.containerPath);
            }
          }
          return await this.execInContainer({
            containerId,
            command: options.command,
            stdin: options.stdin
          });
        } finally {
          // The container is removed whether grading succeeded or threw.
          await this.removeContainer(containerId);
        }
      }
    };
  }
});
454
+
112
455
  // ../../node_modules/.bun/@opentelemetry+core@2.5.1+460773ef8ff1e07c/node_modules/@opentelemetry/core/build/esm/trace/suppress-tracing.js
113
456
  function suppressTracing(context2) {
114
457
  return context2.setValue(SUPPRESS_TRACING_KEY, true);
@@ -1355,13 +1698,13 @@ function serializeAttributeValue(value) {
1355
1698
  if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
1356
1699
  return { stringValue: String(value) };
1357
1700
  }
1358
- var import_promises37, import_node_path54, OtlpJsonFileExporter;
1701
+ var import_promises39, import_node_path55, OtlpJsonFileExporter;
1359
1702
  var init_otlp_json_file_exporter = __esm({
1360
1703
  "src/observability/otlp-json-file-exporter.ts"() {
1361
1704
  "use strict";
1362
1705
  init_cjs_shims();
1363
- import_promises37 = require("fs/promises");
1364
- import_node_path54 = require("path");
1706
+ import_promises39 = require("fs/promises");
1707
+ import_node_path55 = require("path");
1365
1708
  OtlpJsonFileExporter = class {
1366
1709
  // biome-ignore lint/suspicious/noExplicitAny: serialized span data
1367
1710
  spans = [];
@@ -1400,7 +1743,7 @@ var init_otlp_json_file_exporter = __esm({
1400
1743
  }
1401
1744
  async flush() {
1402
1745
  if (this.spans.length === 0) return;
1403
- await (0, import_promises37.mkdir)((0, import_node_path54.dirname)(this.filePath), { recursive: true });
1746
+ await (0, import_promises39.mkdir)((0, import_node_path55.dirname)(this.filePath), { recursive: true });
1404
1747
  const otlpJson = {
1405
1748
  resourceSpans: [
1406
1749
  {
@@ -1434,6 +1777,7 @@ __export(index_exports, {
1434
1777
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
1435
1778
  DEFAULT_THRESHOLD: () => DEFAULT_THRESHOLD,
1436
1779
  DeterministicAssertionEvaluator: () => DeterministicAssertionEvaluator,
1780
+ DockerWorkspaceProvider: () => DockerWorkspaceProvider,
1437
1781
  EvaluatorRegistry: () => EvaluatorRegistry,
1438
1782
  ExecutionMetricsEvaluator: () => ExecutionMetricsEvaluator,
1439
1783
  FieldAccuracyEvaluator: () => FieldAccuracyEvaluator,
@@ -1469,9 +1813,11 @@ __export(index_exports, {
1469
1813
  buildSearchRoots: () => buildSearchRoots2,
1470
1814
  calculateRubricScore: () => calculateRubricScore,
1471
1815
  captureFileChanges: () => captureFileChanges,
1816
+ checkoutResultsRepoBranch: () => checkoutResultsRepoBranch,
1472
1817
  clampScore: () => clampScore,
1473
1818
  cleanupEvalWorkspaces: () => cleanupEvalWorkspaces,
1474
1819
  cleanupWorkspace: () => cleanupWorkspace,
1820
+ commitAndPushResultsBranch: () => commitAndPushResultsBranch,
1475
1821
  computeTraceSummary: () => computeTraceSummary,
1476
1822
  computeWorkspaceFingerprint: () => computeWorkspaceFingerprint,
1477
1823
  consumeClaudeLogEntries: () => consumeClaudeLogEntries,
@@ -1482,6 +1828,7 @@ __export(index_exports, {
1482
1828
  createAgentKernel: () => createAgentKernel,
1483
1829
  createBuiltinProviderRegistry: () => createBuiltinProviderRegistry,
1484
1830
  createBuiltinRegistry: () => createBuiltinRegistry,
1831
+ createDraftResultsPr: () => createDraftResultsPr,
1485
1832
  createProvider: () => createProvider,
1486
1833
  createTempWorkspace: () => createTempWorkspace,
1487
1834
  deepEqual: () => deepEqual,
@@ -1489,6 +1836,7 @@ __export(index_exports, {
1489
1836
  deriveCategory: () => deriveCategory,
1490
1837
  deriveProjectId: () => deriveProjectId,
1491
1838
  detectFormat: () => detectFormat,
1839
+ directorySizeBytes: () => directorySizeBytes,
1492
1840
  discoverAssertions: () => discoverAssertions,
1493
1841
  discoverClaudeSessions: () => discoverClaudeSessions,
1494
1842
  discoverCodexSessions: () => discoverCodexSessions,
@@ -1497,6 +1845,7 @@ __export(index_exports, {
1497
1845
  discoverJudges: () => discoverGraders,
1498
1846
  discoverProjects: () => discoverProjects,
1499
1847
  discoverProviders: () => discoverProviders,
1848
+ ensureResultsRepoClone: () => ensureResultsRepoClone,
1500
1849
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
1501
1850
  evaluate: () => evaluate,
1502
1851
  executeScript: () => executeScript,
@@ -1521,6 +1870,8 @@ __export(index_exports, {
1521
1870
  getOutputFilenames: () => getOutputFilenames,
1522
1871
  getProject: () => getProject,
1523
1872
  getProjectsRegistryPath: () => getProjectsRegistryPath,
1873
+ getResultsRepoCachePaths: () => getResultsRepoCachePaths,
1874
+ getResultsRepoStatus: () => getResultsRepoStatus,
1524
1875
  getSubagentsRoot: () => getSubagentsRoot,
1525
1876
  getTextContent: () => getTextContent,
1526
1877
  getTraceStateRoot: () => getTraceStateRoot,
@@ -1550,12 +1901,15 @@ __export(index_exports, {
1550
1901
  mergeExecutionMetrics: () => mergeExecutionMetrics,
1551
1902
  negateScore: () => negateScore,
1552
1903
  normalizeLineEndings: () => normalizeLineEndings,
1904
+ normalizeResultsExportConfig: () => normalizeResultsExportConfig,
1553
1905
  parseAgentSkillsEvals: () => parseAgentSkillsEvals,
1554
1906
  parseClaudeSession: () => parseClaudeSession,
1555
1907
  parseCodexSession: () => parseCodexSession,
1556
1908
  parseCopilotEvents: () => parseCopilotEvents,
1557
1909
  parseJsonFromText: () => parseJsonFromText,
1558
1910
  parseJsonSafe: () => parseJsonSafe,
1911
+ prepareResultsRepoBranch: () => prepareResultsRepoBranch,
1912
+ pushResultsRepoBranch: () => pushResultsRepoBranch,
1559
1913
  readJsonFile: () => readJsonFile,
1560
1914
  readTargetDefinitions: () => readTargetDefinitions,
1561
1915
  readTestSuiteMetadata: () => readTestSuiteMetadata,
@@ -1566,6 +1920,8 @@ __export(index_exports, {
1566
1920
  resolveAndCreateProvider: () => resolveAndCreateProvider,
1567
1921
  resolveDelegatedTargetDefinition: () => resolveDelegatedTargetDefinition,
1568
1922
  resolveFileReference: () => resolveFileReference3,
1923
+ resolveResultsRepoRunsDir: () => resolveResultsRepoRunsDir,
1924
+ resolveResultsRepoUrl: () => resolveResultsRepoUrl,
1569
1925
  resolveTargetDefinition: () => resolveTargetDefinition,
1570
1926
  resolveWorkspaceTemplate: () => resolveWorkspaceTemplate,
1571
1927
  rubricEvaluationSchema: () => rubricEvaluationSchema,
@@ -1587,12 +1943,14 @@ __export(index_exports, {
1587
1943
  scoreToVerdict: () => scoreToVerdict,
1588
1944
  shouldEnableCache: () => shouldEnableCache,
1589
1945
  shouldSkipCacheForTemperature: () => shouldSkipCacheForTemperature,
1946
+ stageResultsArtifacts: () => stageResultsArtifacts,
1590
1947
  subscribeToClaudeLogEntries: () => subscribeToClaudeLogEntries,
1591
1948
  subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
1592
1949
  subscribeToCopilotCliLogEntries: () => subscribeToCopilotCliLogEntries,
1593
1950
  subscribeToCopilotSdkLogEntries: () => subscribeToCopilotSdkLogEntries,
1594
1951
  subscribeToPiLogEntries: () => subscribeToPiLogEntries,
1595
1952
  substituteVariables: () => substituteVariables,
1953
+ syncResultsRepo: () => syncResultsRepo,
1596
1954
  toCamelCaseDeep: () => toCamelCaseDeep,
1597
1955
  toSnakeCaseDeep: () => toSnakeCaseDeep,
1598
1956
  toTranscriptJsonLine: () => toTranscriptJsonLine,
@@ -1829,10 +2187,10 @@ function mergeExecutionMetrics(computed, metrics) {
1829
2187
 
1830
2188
  // src/evaluation/yaml-parser.ts
1831
2189
  init_cjs_shims();
1832
- var import_promises9 = require("fs/promises");
2190
+ var import_promises10 = require("fs/promises");
1833
2191
  var import_node_path9 = __toESM(require("path"), 1);
1834
2192
  var import_micromatch2 = __toESM(require("micromatch"), 1);
1835
- var import_yaml4 = require("yaml");
2193
+ var import_yaml5 = require("yaml");
1836
2194
 
1837
2195
  // src/evaluation/input-message-utils.ts
1838
2196
  init_cjs_shims();
@@ -2261,10 +2619,12 @@ async function loadConfig(evalFilePath, repoRoot) {
2261
2619
  parsed.execution,
2262
2620
  configPath
2263
2621
  );
2622
+ const results = parseResultsConfig(parsed.results, configPath);
2264
2623
  return {
2265
2624
  required_version: requiredVersion,
2266
2625
  eval_patterns: evalPatterns,
2267
- execution: executionDefaults
2626
+ execution: executionDefaults,
2627
+ results
2268
2628
  };
2269
2629
  } catch (error) {
2270
2630
  logWarning(
@@ -2499,166 +2859,77 @@ function parseExecutionDefaults(raw, configPath) {
2499
2859
  }
2500
2860
  return Object.keys(result).length > 0 ? result : void 0;
2501
2861
  }
2862
/**
 * Parse the optional `results` section of a config file.
 *
 * @param {unknown} raw - Value of the `results` key.
 * @param {string} configPath - Config file path, used in warning messages.
 * @returns {{export: object}|undefined} `{ export }` when a valid export
 *   section is present; `undefined` when the section is missing or invalid
 *   (a warning is logged when `raw` is not an object).
 */
function parseResultsConfig(raw, configPath) {
  if (raw == null) {
    return undefined;
  }
  const isPlainContainer = typeof raw === "object" && !Array.isArray(raw);
  if (!isPlainContainer) {
    logWarning(`Invalid results in ${configPath}, expected object`);
    return undefined;
  }
  const exportSettings = parseResultsExportConfig(raw.export, configPath);
  return exportSettings ? { export: exportSettings } : undefined;
}
2877
/**
 * Parse and validate the `results.export` section of a config file.
 *
 * `repo` and `path` are required non-empty strings (trimmed). `auto_push`, if
 * present, must be a boolean. `branch_prefix`, if present, must be a non-empty
 * string (trimmed). Any violation logs a warning and rejects the whole section.
 *
 * @param {unknown} raw - Value of the `results.export` key.
 * @param {string} configPath - Config file path, used in warning messages.
 * @returns {{repo: string, path: string, auto_push?: boolean, branch_prefix?: string}|undefined}
 */
function parseResultsExportConfig(raw, configPath) {
  if (raw == null) {
    return undefined;
  }
  if (Array.isArray(raw) || typeof raw !== "object") {
    logWarning(`Invalid results.export in ${configPath}, expected object`);
    return undefined;
  }

  const trimmedOrEmpty = (candidate) => (typeof candidate === "string" ? candidate.trim() : "");

  const repo = trimmedOrEmpty(raw.repo);
  const exportPath = trimmedOrEmpty(raw.path);
  if (repo === "") {
    logWarning(`Invalid results.export.repo in ${configPath}, expected non-empty string`);
    return undefined;
  }
  if (exportPath === "") {
    logWarning(`Invalid results.export.path in ${configPath}, expected non-empty string`);
    return undefined;
  }

  const autoPush = raw.auto_push;
  if (autoPush !== undefined && typeof autoPush !== "boolean") {
    logWarning(`Invalid results.export.auto_push in ${configPath}, expected boolean`);
    return undefined;
  }

  const settings = { repo, path: exportPath };
  if (typeof autoPush === "boolean") {
    settings.auto_push = autoPush;
  }
  if (raw.branch_prefix !== undefined) {
    const prefix = trimmedOrEmpty(raw.branch_prefix);
    if (prefix === "") {
      logWarning(
        `Invalid results.export.branch_prefix in ${configPath}, expected non-empty string`
      );
      return undefined;
    }
    settings.branch_prefix = prefix;
  }
  return settings;
}
2502
2917
  function logWarning(message) {
2503
2918
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET3}`);
2504
2919
  }
2505
2920
 
2506
2921
  // src/evaluation/loaders/evaluator-parser.ts
2507
2922
  init_cjs_shims();
2923
+ var import_promises7 = require("fs/promises");
2508
2924
  var import_node_path6 = __toESM(require("path"), 1);
2925
+ var import_yaml3 = require("yaml");
2509
2926
 
2510
2927
  // src/evaluation/content-preprocessor.ts
2511
2928
  init_cjs_shims();
2512
2929
  var import_promises5 = require("fs/promises");
2513
2930
  var import_node_path5 = __toESM(require("path"), 1);
2514
2931
  var import_node_url2 = require("url");
2515
-
2516
- // src/runtime/exec.ts
2517
- init_cjs_shims();
2518
- function shellEscapePath(value) {
2519
- if (process.platform === "win32") {
2520
- return `"${value.replaceAll('"', '""')}"`;
2521
- }
2522
- return `'${value.replaceAll("'", `'"'"'`)}'`;
2523
- }
2524
- async function execFileWithStdin(argv, stdinPayload, options = {}) {
2525
- if (argv.length === 0) {
2526
- throw new Error("Executable argv must include at least one entry");
2527
- }
2528
- if (typeof Bun !== "undefined") {
2529
- return execFileWithStdinBun(argv, stdinPayload, options);
2530
- }
2531
- return execFileWithStdinNode(argv, stdinPayload, options);
2532
- }
2533
- async function execFileWithStdinBun(argv, stdinPayload, options) {
2534
- const command = [...argv];
2535
- const encoder = new TextEncoder();
2536
- const proc = Bun.spawn(command, {
2537
- cwd: options.cwd,
2538
- stdin: encoder.encode(stdinPayload),
2539
- stdout: "pipe",
2540
- stderr: "pipe",
2541
- // Merge additional env vars with process.env
2542
- env: options.env ? { ...process.env, ...options.env } : process.env
2543
- });
2544
- let timedOut = false;
2545
- const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
2546
- timedOut = true;
2547
- proc.kill("SIGKILL");
2548
- }, options.timeoutMs) : void 0;
2549
- try {
2550
- const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
2551
- const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
2552
- const [stdout, stderr, exitCode] = await Promise.all([
2553
- stdoutPromise,
2554
- stderrPromise,
2555
- proc.exited
2556
- ]);
2557
- if (timedOut) {
2558
- throw new Error(`Process timed out after ${options.timeoutMs}ms`);
2559
- }
2560
- return {
2561
- stdout: stdout.replace(/\r\n/g, "\n"),
2562
- stderr: stderr.replace(/\r\n/g, "\n"),
2563
- exitCode
2564
- };
2565
- } finally {
2566
- if (timeout !== void 0) {
2567
- clearTimeout(timeout);
2568
- }
2569
- }
2570
- }
2571
- async function execFileWithStdinNode(argv, stdinPayload, options) {
2572
- const { spawn: spawn5 } = await import("child_process");
2573
- return new Promise((resolve, reject) => {
2574
- const [cmd, ...args] = argv;
2575
- const child = spawn5(cmd, args, {
2576
- cwd: options.cwd,
2577
- stdio: ["pipe", "pipe", "pipe"],
2578
- // Merge additional env vars with process.env
2579
- env: options.env ? { ...process.env, ...options.env } : process.env
2580
- });
2581
- const stdoutChunks = [];
2582
- const stderrChunks = [];
2583
- child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
2584
- child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
2585
- let timedOut = false;
2586
- const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
2587
- timedOut = true;
2588
- child.kill("SIGKILL");
2589
- }, options.timeoutMs) : void 0;
2590
- child.on("error", (error) => {
2591
- if (timeout !== void 0) clearTimeout(timeout);
2592
- reject(error);
2593
- });
2594
- child.on("close", (code) => {
2595
- if (timeout !== void 0) clearTimeout(timeout);
2596
- if (timedOut) {
2597
- reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
2598
- return;
2599
- }
2600
- const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
2601
- const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
2602
- resolve({
2603
- stdout,
2604
- stderr,
2605
- exitCode: code ?? 0
2606
- });
2607
- });
2608
- if (child.stdin) {
2609
- child.stdin.write(stdinPayload);
2610
- child.stdin.end();
2611
- }
2612
- });
2613
- }
2614
- async function execShellWithStdin(command, stdinPayload, options = {}) {
2615
- const { mkdir: mkdir17, readFile: readFile19, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
2616
- const { tmpdir: tmpdir3 } = await import("os");
2617
- const path55 = await import("path");
2618
- const { randomUUID: randomUUID10 } = await import("crypto");
2619
- const dir = path55.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
2620
- await mkdir17(dir, { recursive: true });
2621
- const stdinPath = path55.join(dir, "stdin.txt");
2622
- const stdoutPath = path55.join(dir, "stdout.txt");
2623
- const stderrPath = path55.join(dir, "stderr.txt");
2624
- await writeFile9(stdinPath, stdinPayload, "utf8");
2625
- const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
2626
- const { spawn: spawn5 } = await import("child_process");
2627
- try {
2628
- const exitCode = await new Promise((resolve, reject) => {
2629
- const child = spawn5(wrappedCommand, {
2630
- shell: true,
2631
- cwd: options.cwd,
2632
- stdio: ["ignore", "ignore", "ignore"],
2633
- // Merge additional env vars with process.env
2634
- env: options.env ? { ...process.env, ...options.env } : process.env
2635
- });
2636
- const timeout = options.timeoutMs ? setTimeout(() => {
2637
- child.kill();
2638
- reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
2639
- }, options.timeoutMs) : void 0;
2640
- child.on("error", (error) => {
2641
- if (timeout !== void 0) {
2642
- clearTimeout(timeout);
2643
- }
2644
- reject(error);
2645
- });
2646
- child.on("exit", (code) => {
2647
- if (timeout !== void 0) {
2648
- clearTimeout(timeout);
2649
- }
2650
- resolve(code ?? 0);
2651
- });
2652
- });
2653
- const stdout = (await readFile19(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
2654
- const stderr = (await readFile19(stderrPath, "utf8")).replace(/\r\n/g, "\n");
2655
- return { stdout, stderr, exitCode };
2656
- } finally {
2657
- await rm6(dir, { recursive: true, force: true });
2658
- }
2659
- }
2660
-
2661
- // src/evaluation/content-preprocessor.ts
2932
+ init_exec();
2662
2933
  var MIME_TYPE_ALIASES = {
2663
2934
  csv: "text/csv",
2664
2935
  docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -2901,6 +3172,7 @@ function validateTemplateVariables(content, source) {
2901
3172
  // src/evaluation/loaders/evaluator-parser.ts
2902
3173
  var ANSI_YELLOW4 = "\x1B[33m";
2903
3174
  var ANSI_RESET5 = "\x1B[0m";
3175
+ var MAX_ASSERTION_INCLUDE_DEPTH = 3;
2904
3176
  var PROMPT_FILE_PREFIX = "file://";
2905
3177
  function normalizeEvaluatorType(type) {
2906
3178
  return type.replace(/_/g, "-");
@@ -2933,7 +3205,79 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
2933
3205
  const evaluators = [...parsedCase ?? [], ...parsedRoot ?? []];
2934
3206
  return evaluators.length > 0 ? evaluators : void 0;
2935
3207
  }
2936
- async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
3208
+ function isIncludeEntry(value) {
3209
+ return isJsonObject2(value) && typeof value.include === "string" && Object.keys(value).length === 1;
3210
+ }
3211
+ function isTemplateReference(value) {
3212
+ return !value.startsWith(".") && !value.includes("/") && !value.includes("\\");
3213
+ }
3214
+ async function resolveAssertionTemplateReference(include, searchRoots) {
3215
+ const templateCandidates = isTemplateReference(include) ? [
3216
+ import_node_path6.default.join(".agentv", "templates", `${include}.yaml`),
3217
+ import_node_path6.default.join(".agentv", "templates", `${include}.yml`)
3218
+ ] : [include];
3219
+ const attempted = [];
3220
+ for (const candidate of templateCandidates) {
3221
+ const resolved = await resolveFileReference2(candidate, searchRoots);
3222
+ attempted.push(...resolved.attempted);
3223
+ if (resolved.resolvedPath) {
3224
+ return {
3225
+ displayPath: resolved.displayPath,
3226
+ resolvedPath: resolved.resolvedPath,
3227
+ attempted
3228
+ };
3229
+ }
3230
+ }
3231
+ return {
3232
+ displayPath: templateCandidates[0] ?? include,
3233
+ resolvedPath: "",
3234
+ attempted
3235
+ };
3236
+ }
3237
+ async function loadAssertionTemplateEntries(include, searchRoots, evalId, includeContext) {
3238
+ const nextDepth = includeContext.depth + 1;
3239
+ if (nextDepth > MAX_ASSERTION_INCLUDE_DEPTH) {
3240
+ const chain = [...includeContext.chain, include].join(" -> ");
3241
+ throw new Error(
3242
+ `Assertion template include depth exceeded ${MAX_ASSERTION_INCLUDE_DEPTH} in '${evalId}'. Include chain: ${chain}`
3243
+ );
3244
+ }
3245
+ const resolved = await resolveAssertionTemplateReference(include, searchRoots);
3246
+ if (!resolved.resolvedPath) {
3247
+ const attempted = resolved.attempted.length > 0 ? `
3248
+ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
3249
+ throw new Error(
3250
+ `Assertion template not found in '${evalId}': ${resolved.displayPath}${attempted}`
3251
+ );
3252
+ }
3253
+ if (includeContext.chain.includes(resolved.resolvedPath)) {
3254
+ const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
3255
+ throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
3256
+ }
3257
+ const content = await (0, import_promises7.readFile)(resolved.resolvedPath, "utf8");
3258
+ const parsed = interpolateEnv((0, import_yaml3.parse)(content), process.env);
3259
+ if (!isJsonObject2(parsed)) {
3260
+ throw new Error(
3261
+ `Invalid assertion template file in '${evalId}': ${resolved.resolvedPath} (expected a YAML object with an assertions array)`
3262
+ );
3263
+ }
3264
+ const assertions = parsed.assertions;
3265
+ if (!Array.isArray(assertions)) {
3266
+ throw new Error(
3267
+ `Invalid assertion template file in '${evalId}': ${resolved.resolvedPath} is missing a top-level assertions array`
3268
+ );
3269
+ }
3270
+ const templateDir = import_node_path6.default.dirname(resolved.resolvedPath);
3271
+ const nestedSearchRoots = [
3272
+ templateDir,
3273
+ ...searchRoots.filter((root) => import_node_path6.default.resolve(root) !== templateDir)
3274
+ ];
3275
+ return await expandEvaluatorEntries(assertions, nestedSearchRoots, evalId, {
3276
+ depth: nextDepth,
3277
+ chain: [...includeContext.chain, resolved.resolvedPath]
3278
+ }) ?? [];
3279
+ }
3280
+ async function expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
2937
3281
  if (candidateEvaluators === void 0) {
2938
3282
  return void 0;
2939
3283
  }
@@ -2941,13 +3285,34 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
2941
3285
  logWarning2(`Skipping evaluators for '${evalId}': expected array`);
2942
3286
  return void 0;
2943
3287
  }
2944
- const firstStringIndex = candidateEvaluators.findIndex((e) => typeof e === "string");
2945
- const processedEvaluators = firstStringIndex === -1 ? [...candidateEvaluators] : (() => {
3288
+ const expanded = [];
3289
+ for (const rawEvaluator of candidateEvaluators) {
3290
+ if (isIncludeEntry(rawEvaluator)) {
3291
+ const included = await loadAssertionTemplateEntries(
3292
+ rawEvaluator.include,
3293
+ searchRoots,
3294
+ evalId,
3295
+ includeContext
3296
+ );
3297
+ expanded.push(...included);
3298
+ continue;
3299
+ }
3300
+ expanded.push(rawEvaluator);
3301
+ }
3302
+ return expanded;
3303
+ }
3304
+ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
3305
+ const expandedEvaluators = await expandEvaluatorEntries(candidateEvaluators, searchRoots, evalId);
3306
+ if (!expandedEvaluators) {
3307
+ return void 0;
3308
+ }
3309
+ const firstStringIndex = expandedEvaluators.findIndex((e) => typeof e === "string");
3310
+ const processedEvaluators = firstStringIndex === -1 ? [...expandedEvaluators] : (() => {
2946
3311
  const PLACEHOLDER = Symbol("rubric-placeholder");
2947
3312
  const strings = [];
2948
3313
  const result = [];
2949
3314
  let rubricInserted = false;
2950
- for (const item of candidateEvaluators) {
3315
+ for (const item of expandedEvaluators) {
2951
3316
  if (typeof item === "string") {
2952
3317
  const trimmed = item.trim();
2953
3318
  if (trimmed.length === 0) {
@@ -3162,8 +3527,16 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
3162
3527
  );
3163
3528
  continue;
3164
3529
  }
3530
+ const expandedMembers = await expandEvaluatorEntries(
3531
+ rawMembers,
3532
+ searchRoots,
3533
+ `${evalId}:${name}`
3534
+ );
3535
+ if (!expandedMembers) {
3536
+ continue;
3537
+ }
3165
3538
  const memberEvaluators = [];
3166
- for (const rawMember of rawMembers) {
3539
+ for (const rawMember of expandedMembers) {
3167
3540
  if (!isJsonObject2(rawMember)) {
3168
3541
  logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
3169
3542
  continue;
@@ -4490,14 +4863,14 @@ function parseInlineRubrics(rawRubrics) {
4490
4863
 
4491
4864
  // src/evaluation/loaders/jsonl-parser.ts
4492
4865
  init_cjs_shims();
4493
- var import_promises8 = require("fs/promises");
4866
+ var import_promises9 = require("fs/promises");
4494
4867
  var import_node_path8 = __toESM(require("path"), 1);
4495
4868
  var import_micromatch = __toESM(require("micromatch"), 1);
4496
- var import_yaml3 = require("yaml");
4869
+ var import_yaml4 = require("yaml");
4497
4870
 
4498
4871
  // src/evaluation/loaders/message-processor.ts
4499
4872
  init_cjs_shims();
4500
- var import_promises7 = require("fs/promises");
4873
+ var import_promises8 = require("fs/promises");
4501
4874
  var import_node_path7 = __toESM(require("path"), 1);
4502
4875
 
4503
4876
  // src/evaluation/formatting/segment-formatter.ts
@@ -4615,7 +4988,7 @@ async function processMessages(options) {
4615
4988
  continue;
4616
4989
  }
4617
4990
  try {
4618
- const fileContent = (await (0, import_promises7.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
4991
+ const fileContent = (await (0, import_promises8.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
4619
4992
  processedContent.push({
4620
4993
  ...cloneJsonObject(rawSegment),
4621
4994
  path: displayPath,
@@ -4656,7 +5029,7 @@ async function processMessages(options) {
4656
5029
  continue;
4657
5030
  }
4658
5031
  try {
4659
- const imageBuffer = await (0, import_promises7.readFile)(resolvedPath);
5032
+ const imageBuffer = await (0, import_promises8.readFile)(resolvedPath);
4660
5033
  const base64 = imageBuffer.toString("base64");
4661
5034
  processedContent.push({
4662
5035
  type: "image",
@@ -4733,7 +5106,7 @@ async function processExpectedMessages(options) {
4733
5106
  continue;
4734
5107
  }
4735
5108
  try {
4736
- const fileContent = (await (0, import_promises7.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
5109
+ const fileContent = (await (0, import_promises8.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
4737
5110
  processedContent.push({
4738
5111
  type: "file",
4739
5112
  path: displayPath,
@@ -4773,7 +5146,7 @@ async function processExpectedMessages(options) {
4773
5146
  continue;
4774
5147
  }
4775
5148
  try {
4776
- const imageBuffer = await (0, import_promises7.readFile)(resolvedPath);
5149
+ const imageBuffer = await (0, import_promises8.readFile)(resolvedPath);
4777
5150
  const base64 = imageBuffer.toString("base64");
4778
5151
  processedContent.push({
4779
5152
  type: "image",
@@ -4902,8 +5275,8 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
4902
5275
  return {};
4903
5276
  }
4904
5277
  try {
4905
- const content = await (0, import_promises8.readFile)(sidecarPath, "utf8");
4906
- const parsed = interpolateEnv((0, import_yaml3.parse)(content), process.env);
5278
+ const content = await (0, import_promises9.readFile)(sidecarPath, "utf8");
5279
+ const parsed = interpolateEnv((0, import_yaml4.parse)(content), process.env);
4907
5280
  if (!isJsonObject(parsed)) {
4908
5281
  logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
4909
5282
  return {};
@@ -4947,7 +5320,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4947
5320
  const repoRootPath = resolveToAbsolutePath(repoRoot);
4948
5321
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
4949
5322
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
4950
- const rawFile = await (0, import_promises8.readFile)(absoluteTestPath, "utf8");
5323
+ const rawFile = await (0, import_promises9.readFile)(absoluteTestPath, "utf8");
4951
5324
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
4952
5325
  const fallbackSuiteName = import_node_path8.default.basename(absoluteTestPath, ".jsonl") || "eval";
4953
5326
  const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
@@ -5131,11 +5504,13 @@ function parseRepoCheckout(raw) {
5131
5504
  if (!isJsonObject(raw)) return void 0;
5132
5505
  const obj = raw;
5133
5506
  const ref = typeof obj.ref === "string" ? obj.ref : void 0;
5507
+ const baseCommit = typeof obj.base_commit === "string" ? obj.base_commit : void 0;
5134
5508
  const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
5135
5509
  const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
5136
- if (!ref && !resolve && ancestor === void 0) return void 0;
5510
+ if (!ref && !baseCommit && !resolve && ancestor === void 0) return void 0;
5137
5511
  return {
5138
5512
  ...ref !== void 0 && { ref },
5513
+ ...baseCommit !== void 0 && { base_commit: baseCommit },
5139
5514
  ...resolve !== void 0 && { resolve },
5140
5515
  ...ancestor !== void 0 && { ancestor }
5141
5516
  };
@@ -5158,12 +5533,12 @@ function parseRepoConfig(raw) {
5158
5533
  const obj = raw;
5159
5534
  const repoPath = typeof obj.path === "string" ? obj.path : void 0;
5160
5535
  const source = parseRepoSource(obj.source);
5161
- if (!repoPath || !source) return void 0;
5162
5536
  const checkout = parseRepoCheckout(obj.checkout);
5163
5537
  const clone = parseRepoClone(obj.clone);
5538
+ if (!repoPath && !source && !checkout && !clone) return void 0;
5164
5539
  return {
5165
- path: repoPath,
5166
- source,
5540
+ ...repoPath !== void 0 && { path: repoPath },
5541
+ ...source !== void 0 && { source },
5167
5542
  ...checkout !== void 0 && { checkout },
5168
5543
  ...clone !== void 0 && { clone }
5169
5544
  };
@@ -5215,7 +5590,8 @@ ${messageContent}`);
5215
5590
  segmentsByMessage,
5216
5591
  mode
5217
5592
  }) : void 0;
5218
- return { question, chatPrompt };
5593
+ const systemMessage = extractSystemMessage(testCase.input, segmentsByMessage, mode);
5594
+ return { question, chatPrompt, systemMessage };
5219
5595
  }
5220
5596
  function needsRoleMarkers(messages, processedSegmentsByMessage) {
5221
5597
  if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
@@ -5229,6 +5605,26 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
5229
5605
  }
5230
5606
  return messagesWithContent > 1;
5231
5607
  }
5608
+ function extractSystemMessage(messages, segmentsByMessage, mode) {
5609
+ const systemParts = [];
5610
+ for (let i = 0; i < messages.length; i++) {
5611
+ if (messages[i].role !== "system") {
5612
+ break;
5613
+ }
5614
+ const segments = segmentsByMessage[i];
5615
+ const contentParts = [];
5616
+ for (const segment of segments) {
5617
+ const formatted = formatSegment(segment, mode);
5618
+ if (formatted) {
5619
+ contentParts.push(formatted);
5620
+ }
5621
+ }
5622
+ if (contentParts.length > 0) {
5623
+ systemParts.push(contentParts.join("\n"));
5624
+ }
5625
+ }
5626
+ return systemParts.length > 0 ? systemParts.join("\n\n") : void 0;
5627
+ }
5232
5628
  function buildChatPromptFromSegments(options) {
5233
5629
  const { messages, segmentsByMessage, systemPrompt, mode = "lm" } = options;
5234
5630
  if (messages.length === 0) {
@@ -5312,8 +5708,8 @@ function resolveTests(suite) {
5312
5708
  async function readTestSuiteMetadata(testFilePath) {
5313
5709
  try {
5314
5710
  const absolutePath = import_node_path9.default.resolve(testFilePath);
5315
- const content = await (0, import_promises9.readFile)(absolutePath, "utf8");
5316
- const parsed = interpolateEnv((0, import_yaml4.parse)(content), process.env);
5711
+ const content = await (0, import_promises10.readFile)(absolutePath, "utf8");
5712
+ const parsed = interpolateEnv((0, import_yaml5.parse)(content), process.env);
5317
5713
  if (!isJsonObject(parsed)) {
5318
5714
  return {};
5319
5715
  }
@@ -5370,8 +5766,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
5370
5766
  const repoRootPath = resolveToAbsolutePath(repoRoot);
5371
5767
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
5372
5768
  const config = await loadConfig(absoluteTestPath, repoRootPath);
5373
- const rawFile = await (0, import_promises9.readFile)(absoluteTestPath, "utf8");
5374
- const interpolated = interpolateEnv((0, import_yaml4.parse)(rawFile), process.env);
5769
+ const rawFile = await (0, import_promises10.readFile)(absoluteTestPath, "utf8");
5770
+ const interpolated = interpolateEnv((0, import_yaml5.parse)(rawFile), process.env);
5375
5771
  if (!isJsonObject(interpolated)) {
5376
5772
  throw new Error(`Invalid test file format: ${evalFilePath}`);
5377
5773
  }
@@ -5512,7 +5908,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
5512
5908
  const testCase = {
5513
5909
  id,
5514
5910
  suite: suiteName,
5515
- category: options?.category,
5911
+ category: suite.category ?? options?.category,
5516
5912
  conversation_id: conversationId,
5517
5913
  question,
5518
5914
  input: inputMessages,
@@ -5605,11 +6001,11 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
5605
6001
  const workspaceFilePath = import_node_path9.default.resolve(evalFileDir, raw);
5606
6002
  let content;
5607
6003
  try {
5608
- content = await (0, import_promises9.readFile)(workspaceFilePath, "utf8");
6004
+ content = await (0, import_promises10.readFile)(workspaceFilePath, "utf8");
5609
6005
  } catch {
5610
6006
  throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
5611
6007
  }
5612
- const parsed = interpolateEnv((0, import_yaml4.parse)(content), process.env);
6008
+ const parsed = interpolateEnv((0, import_yaml5.parse)(content), process.env);
5613
6009
  if (!isJsonObject(parsed)) {
5614
6010
  throw new Error(
5615
6011
  `Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
@@ -5644,14 +6040,28 @@ function parseWorkspaceConfig(raw, evalFileDir) {
5644
6040
  const explicitMode = obj.mode === "pooled" || obj.mode === "temp" || obj.mode === "static" ? obj.mode : void 0;
5645
6041
  const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
5646
6042
  const mode = explicitMode ?? (workspacePath ? "static" : void 0);
5647
- if (!template && !isolation && !repos && !hooks && !mode && !workspacePath) return void 0;
6043
+ const docker = parseDockerWorkspaceConfig(obj.docker);
6044
+ if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker)
6045
+ return void 0;
5648
6046
  return {
5649
6047
  ...template !== void 0 && { template },
5650
6048
  ...isolation !== void 0 && { isolation },
5651
6049
  ...repos !== void 0 && { repos },
5652
6050
  ...hooks !== void 0 && { hooks },
5653
6051
  ...mode !== void 0 && { mode },
5654
- ...workspacePath !== void 0 && { path: workspacePath }
6052
+ ...workspacePath !== void 0 && { path: workspacePath },
6053
+ ...docker !== void 0 && { docker }
6054
+ };
6055
+ }
6056
+ function parseDockerWorkspaceConfig(raw) {
6057
+ if (!isJsonObject(raw)) return void 0;
6058
+ const obj = raw;
6059
+ if (typeof obj.image !== "string") return void 0;
6060
+ return {
6061
+ image: obj.image,
6062
+ ...typeof obj.timeout === "number" && { timeout: obj.timeout },
6063
+ ...typeof obj.memory === "string" && { memory: obj.memory },
6064
+ ...typeof obj.cpus === "number" && { cpus: obj.cpus }
5655
6065
  };
5656
6066
  }
5657
6067
  function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
@@ -5680,7 +6090,8 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
5680
6090
  repos: caseLevel.repos ?? suiteLevel.repos,
5681
6091
  ...hasHooks && { hooks: mergedHooks },
5682
6092
  mode: caseLevel.mode ?? suiteLevel.mode,
5683
- path: caseLevel.path ?? suiteLevel.path
6093
+ path: caseLevel.path ?? suiteLevel.path,
6094
+ docker: caseLevel.docker ?? suiteLevel.docker
5684
6095
  };
5685
6096
  }
5686
6097
  function asString5(value) {
@@ -5709,7 +6120,7 @@ ${detailBlock}${ANSI_RESET8}`);
5709
6120
  init_cjs_shims();
5710
6121
  var import_node_fs2 = require("fs");
5711
6122
  var import_node_path10 = __toESM(require("path"), 1);
5712
- var import_yaml5 = require("yaml");
6123
+ var import_yaml6 = require("yaml");
5713
6124
  function codeGraderInstruction(graderName, description) {
5714
6125
  const desc = description ? ` This grader: ${description}.` : "";
5715
6126
  return `Run \`agentv eval assert ${graderName} --agent-output <agent_output> --agent-input <original_prompt>\` and check the result.${desc} The command accepts --agent-output (the agent's full response text) and --agent-input (the original user prompt). It returns JSON on stdout: {"score": 0-1, "reasoning": "..."}. A score >= 0.5 means pass (exit 0); below 0.5 means fail (exit 1).`;
@@ -5948,7 +6359,7 @@ function transpileEvalYaml(suite, source = "EVAL.yaml") {
5948
6359
  }
5949
6360
  function transpileEvalYamlFile(evalYamlPath) {
5950
6361
  const content = (0, import_node_fs2.readFileSync)(evalYamlPath, "utf8");
5951
- const parsed = (0, import_yaml5.parse)(content);
6362
+ const parsed = (0, import_yaml6.parse)(content);
5952
6363
  return transpileEvalYaml(parsed, import_node_path10.default.basename(evalYamlPath));
5953
6364
  }
5954
6365
  function getOutputFilenames(result) {
@@ -5969,11 +6380,11 @@ function getOutputFilenames(result) {
5969
6380
  // src/evaluation/file-utils.ts
5970
6381
  init_cjs_shims();
5971
6382
  var import_node_fs3 = require("fs");
5972
- var import_promises10 = require("fs/promises");
6383
+ var import_promises11 = require("fs/promises");
5973
6384
  var import_node_path11 = __toESM(require("path"), 1);
5974
6385
  async function fileExists2(filePath) {
5975
6386
  try {
5976
- await (0, import_promises10.access)(filePath, import_node_fs3.constants.F_OK);
6387
+ await (0, import_promises11.access)(filePath, import_node_fs3.constants.F_OK);
5977
6388
  return true;
5978
6389
  } catch {
5979
6390
  return false;
@@ -5983,11 +6394,11 @@ function normalizeLineEndings(content) {
5983
6394
  return content.replace(/\r\n/g, "\n");
5984
6395
  }
5985
6396
  async function readTextFile(filePath) {
5986
- const content = await (0, import_promises10.readFile)(filePath, "utf8");
6397
+ const content = await (0, import_promises11.readFile)(filePath, "utf8");
5987
6398
  return normalizeLineEndings(content);
5988
6399
  }
5989
6400
  async function readJsonFile(filePath) {
5990
- const content = await (0, import_promises10.readFile)(filePath, "utf8");
6401
+ const content = await (0, import_promises11.readFile)(filePath, "utf8");
5991
6402
  return JSON.parse(content);
5992
6403
  }
5993
6404
  async function findGitRoot(startPath) {
@@ -6508,7 +6919,7 @@ init_cjs_shims();
6508
6919
  var import_node_child_process = require("child_process");
6509
6920
  var import_node_crypto = require("crypto");
6510
6921
  var import_node_fs4 = require("fs");
6511
- var import_promises11 = require("fs/promises");
6922
+ var import_promises12 = require("fs/promises");
6512
6923
  var import_node_path13 = __toESM(require("path"), 1);
6513
6924
 
6514
6925
  // src/evaluation/providers/claude-content.ts
@@ -6840,7 +7251,7 @@ var ClaudeCliProvider = class {
6840
7251
  return void 0;
6841
7252
  }
6842
7253
  try {
6843
- await (0, import_promises11.mkdir)(logDir, { recursive: true });
7254
+ await (0, import_promises12.mkdir)(logDir, { recursive: true });
6844
7255
  } catch (error) {
6845
7256
  const message = error instanceof Error ? error.message : String(error);
6846
7257
  console.warn(`Skipping Claude CLI stream logging (could not create ${logDir}): ${message}`);
@@ -7148,7 +7559,7 @@ function tryParseJson(line) {
7148
7559
  init_cjs_shims();
7149
7560
  var import_node_crypto2 = require("crypto");
7150
7561
  var import_node_fs5 = require("fs");
7151
- var import_promises12 = require("fs/promises");
7562
+ var import_promises13 = require("fs/promises");
7152
7563
  var import_node_path14 = __toESM(require("path"), 1);
7153
7564
  var claudeSdkModule = null;
7154
7565
  async function loadClaudeSdk() {
@@ -7333,7 +7744,7 @@ var ClaudeSdkProvider = class {
7333
7744
  return void 0;
7334
7745
  }
7335
7746
  try {
7336
- await (0, import_promises12.mkdir)(logDir, { recursive: true });
7747
+ await (0, import_promises13.mkdir)(logDir, { recursive: true });
7337
7748
  } catch (error) {
7338
7749
  const message = error instanceof Error ? error.message : String(error);
7339
7750
  console.warn(`Skipping Claude stream logging (could not create ${logDir}): ${message}`);
@@ -7526,7 +7937,7 @@ function formatElapsed2(startedAt) {
7526
7937
  // src/evaluation/providers/cli.ts
7527
7938
  init_cjs_shims();
7528
7939
  var import_node_child_process2 = require("child_process");
7529
- var import_promises13 = __toESM(require("fs/promises"), 1);
7940
+ var import_promises14 = __toESM(require("fs/promises"), 1);
7530
7941
  var import_node_os = __toESM(require("os"), 1);
7531
7942
  var import_node_path15 = __toESM(require("path"), 1);
7532
7943
  var import_node_util = require("util");
@@ -7925,7 +8336,7 @@ var CliProvider = class {
7925
8336
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
7926
8337
  } finally {
7927
8338
  if (!this.keepTempFiles) {
7928
- await import_promises13.default.unlink(filePath).catch(() => {
8339
+ await import_promises14.default.unlink(filePath).catch(() => {
7929
8340
  });
7930
8341
  }
7931
8342
  }
@@ -8005,7 +8416,7 @@ var CliProvider = class {
8005
8416
  async function buildTemplateValues(request, config, outputFilePath) {
8006
8417
  const inputFiles = normalizeInputFiles2(request.inputFiles);
8007
8418
  const promptFilePath = generateOutputFilePath(request.evalCaseId, ".prompt.txt");
8008
- await import_promises13.default.writeFile(promptFilePath, request.question ?? "", "utf8");
8419
+ await import_promises14.default.writeFile(promptFilePath, request.question ?? "", "utf8");
8009
8420
  return {
8010
8421
  values: {
8011
8422
  PROMPT: shellEscape(request.question ?? ""),
@@ -8022,7 +8433,7 @@ async function cleanupTempFile(filePath, keepTempFiles) {
8022
8433
  if (!filePath || keepTempFiles) {
8023
8434
  return;
8024
8435
  }
8025
- await import_promises13.default.unlink(filePath).catch(() => {
8436
+ await import_promises14.default.unlink(filePath).catch(() => {
8026
8437
  });
8027
8438
  }
8028
8439
  function normalizeInputFiles2(inputFiles) {
@@ -8083,7 +8494,7 @@ function formatTimeoutSuffix2(timeoutMs) {
8083
8494
  init_cjs_shims();
8084
8495
  var import_node_crypto3 = require("crypto");
8085
8496
  var import_node_fs6 = require("fs");
8086
- var import_promises14 = require("fs/promises");
8497
+ var import_promises15 = require("fs/promises");
8087
8498
  var import_node_path16 = __toESM(require("path"), 1);
8088
8499
 
8089
8500
  // src/evaluation/providers/codex-log-tracker.ts
@@ -8343,7 +8754,7 @@ ${basePrompt}` : basePrompt;
8343
8754
  return void 0;
8344
8755
  }
8345
8756
  try {
8346
- await (0, import_promises14.mkdir)(logDir, { recursive: true });
8757
+ await (0, import_promises15.mkdir)(logDir, { recursive: true });
8347
8758
  } catch (error) {
8348
8759
  const message = error instanceof Error ? error.message : String(error);
8349
8760
  console.warn(`Skipping Codex SDK stream logging (could not create ${logDir}): ${message}`);
@@ -8493,7 +8904,7 @@ function formatElapsed3(startedAt) {
8493
8904
  // src/evaluation/providers/copilot-cli.ts
8494
8905
  init_cjs_shims();
8495
8906
  var import_node_crypto5 = require("crypto");
8496
- var import_promises15 = require("fs/promises");
8907
+ var import_promises16 = require("fs/promises");
8497
8908
  var import_node_path18 = __toESM(require("path"), 1);
8498
8909
  var import_node_stream = require("stream");
8499
8910
  var import_node_child_process3 = require("child_process");
@@ -8562,7 +8973,7 @@ var import_node_path17 = __toESM(require("path"), 1);
8562
8973
  var import_node_url3 = require("url");
8563
8974
  var import_meta = {};
8564
8975
  function resolvePlatformCliPath() {
8565
- const os3 = (0, import_node_os2.platform)();
8976
+ const os4 = (0, import_node_os2.platform)();
8566
8977
  const cpu = (0, import_node_os2.arch)();
8567
8978
  const platformMap = {
8568
8979
  linux: "linux",
@@ -8573,13 +8984,13 @@ function resolvePlatformCliPath() {
8573
8984
  x64: "x64",
8574
8985
  arm64: "arm64"
8575
8986
  };
8576
- const osPart = platformMap[os3];
8987
+ const osPart = platformMap[os4];
8577
8988
  const archPart = archMap[cpu];
8578
8989
  if (!osPart || !archPart) {
8579
8990
  return void 0;
8580
8991
  }
8581
8992
  const packageName = `@github/copilot-${osPart}-${archPart}`;
8582
- const binaryName = os3 === "win32" ? "copilot.exe" : "copilot";
8993
+ const binaryName = os4 === "win32" ? "copilot.exe" : "copilot";
8583
8994
  try {
8584
8995
  const resolved = import_meta.resolve(`${packageName}/package.json`);
8585
8996
  const packageJsonPath = resolved.startsWith("file:") ? (0, import_node_url3.fileURLToPath)(resolved) : resolved;
@@ -8997,7 +9408,7 @@ var CopilotCliProvider = class {
8997
9408
  return void 0;
8998
9409
  }
8999
9410
  try {
9000
- await (0, import_promises15.mkdir)(logDir, { recursive: true });
9411
+ await (0, import_promises16.mkdir)(logDir, { recursive: true });
9001
9412
  } catch (error) {
9002
9413
  const message = error instanceof Error ? error.message : String(error);
9003
9414
  console.warn(`Skipping Copilot CLI stream logging (could not create ${logDir}): ${message}`);
@@ -9097,7 +9508,7 @@ function summarizeAcpEvent(eventType, data) {
9097
9508
 
9098
9509
  // src/evaluation/providers/copilot-log.ts
9099
9510
  init_cjs_shims();
9100
- var import_promises17 = require("fs/promises");
9511
+ var import_promises18 = require("fs/promises");
9101
9512
  var import_node_os4 = require("os");
9102
9513
  var import_node_path20 = __toESM(require("path"), 1);
9103
9514
 
@@ -9233,17 +9644,17 @@ function parseCopilotEvents(eventsJsonl) {
9233
9644
 
9234
9645
  // src/evaluation/providers/copilot-session-discovery.ts
9235
9646
  init_cjs_shims();
9236
- var import_promises16 = require("fs/promises");
9647
+ var import_promises17 = require("fs/promises");
9237
9648
  var import_node_os3 = require("os");
9238
9649
  var import_node_path19 = __toESM(require("path"), 1);
9239
- var import_yaml6 = require("yaml");
9650
+ var import_yaml7 = require("yaml");
9240
9651
  var DEFAULT_SESSION_STATE_DIR = () => import_node_path19.default.join((0, import_node_os3.homedir)(), ".copilot", "session-state");
9241
9652
  async function discoverCopilotSessions(opts) {
9242
9653
  const sessionStateDir = opts?.sessionStateDir ?? DEFAULT_SESSION_STATE_DIR();
9243
9654
  const limit = opts?.limit ?? 10;
9244
9655
  let entries;
9245
9656
  try {
9246
- entries = await (0, import_promises16.readdir)(sessionStateDir);
9657
+ entries = await (0, import_promises17.readdir)(sessionStateDir);
9247
9658
  } catch {
9248
9659
  return [];
9249
9660
  }
@@ -9253,12 +9664,12 @@ async function discoverCopilotSessions(opts) {
9253
9664
  const workspacePath = import_node_path19.default.join(sessionDir, "workspace.yaml");
9254
9665
  const eventsPath = import_node_path19.default.join(sessionDir, "events.jsonl");
9255
9666
  try {
9256
- const workspaceContent = await (0, import_promises16.readFile)(workspacePath, "utf8");
9257
- const workspace = (0, import_yaml6.parse)(workspaceContent) ?? {};
9667
+ const workspaceContent = await (0, import_promises17.readFile)(workspacePath, "utf8");
9668
+ const workspace = (0, import_yaml7.parse)(workspaceContent) ?? {};
9258
9669
  const cwd = String(workspace.cwd ?? "");
9259
9670
  let updatedAt;
9260
9671
  try {
9261
- const eventsStat = await (0, import_promises16.stat)(eventsPath);
9672
+ const eventsStat = await (0, import_promises17.stat)(eventsPath);
9262
9673
  updatedAt = eventsStat.mtime;
9263
9674
  } catch {
9264
9675
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -9315,7 +9726,7 @@ var CopilotLogProvider = class {
9315
9726
  const eventsPath = import_node_path20.default.join(sessionDir, "events.jsonl");
9316
9727
  let eventsContent;
9317
9728
  try {
9318
- eventsContent = await (0, import_promises17.readFile)(eventsPath, "utf8");
9729
+ eventsContent = await (0, import_promises18.readFile)(eventsPath, "utf8");
9319
9730
  } catch (err) {
9320
9731
  throw new Error(
9321
9732
  `Failed to read Copilot session transcript at ${eventsPath}: ${err instanceof Error ? err.message : String(err)}`
@@ -9360,7 +9771,7 @@ var CopilotLogProvider = class {
9360
9771
  init_cjs_shims();
9361
9772
  var import_node_crypto6 = require("crypto");
9362
9773
  var import_node_fs8 = require("fs");
9363
- var import_promises18 = require("fs/promises");
9774
+ var import_promises19 = require("fs/promises");
9364
9775
  var import_node_path21 = __toESM(require("path"), 1);
9365
9776
 
9366
9777
  // src/evaluation/providers/copilot-sdk-log-tracker.ts
@@ -9694,7 +10105,7 @@ var CopilotSdkProvider = class {
9694
10105
  return void 0;
9695
10106
  }
9696
10107
  try {
9697
- await (0, import_promises18.mkdir)(logDir, { recursive: true });
10108
+ await (0, import_promises19.mkdir)(logDir, { recursive: true });
9698
10109
  } catch (error) {
9699
10110
  const message = error instanceof Error ? error.message : String(error);
9700
10111
  console.warn(`Skipping Copilot SDK stream logging (could not create ${logDir}): ${message}`);
@@ -9815,7 +10226,7 @@ init_cjs_shims();
9815
10226
  var import_node_child_process4 = require("child_process");
9816
10227
  var import_node_crypto7 = require("crypto");
9817
10228
  var import_node_fs9 = require("fs");
9818
- var import_promises19 = require("fs/promises");
10229
+ var import_promises20 = require("fs/promises");
9819
10230
  var import_node_os5 = require("os");
9820
10231
  var import_node_path22 = __toESM(require("path"), 1);
9821
10232
 
@@ -10027,7 +10438,7 @@ var PiCliProvider = class {
10027
10438
  const logger = await this.createStreamLogger(request).catch(() => void 0);
10028
10439
  try {
10029
10440
  const promptFile = import_node_path22.default.join(cwd, PROMPT_FILENAME);
10030
- await (0, import_promises19.writeFile)(promptFile, request.question, "utf8");
10441
+ await (0, import_promises20.writeFile)(promptFile, request.question, "utf8");
10031
10442
  const args = this.buildPiArgs(request.question, inputFiles);
10032
10443
  const result = await this.executePi(args, cwd, request.signal, logger);
10033
10444
  if (result.timedOut) {
@@ -10198,11 +10609,11 @@ ${prompt}` : prompt;
10198
10609
  return env;
10199
10610
  }
10200
10611
  async createWorkspace() {
10201
- return await (0, import_promises19.mkdtemp)(import_node_path22.default.join((0, import_node_os5.tmpdir)(), WORKSPACE_PREFIX));
10612
+ return await (0, import_promises20.mkdtemp)(import_node_path22.default.join((0, import_node_os5.tmpdir)(), WORKSPACE_PREFIX));
10202
10613
  }
10203
10614
  async cleanupWorkspace(workspaceRoot) {
10204
10615
  try {
10205
- await (0, import_promises19.rm)(workspaceRoot, { recursive: true, force: true });
10616
+ await (0, import_promises20.rm)(workspaceRoot, { recursive: true, force: true });
10206
10617
  } catch {
10207
10618
  }
10208
10619
  }
@@ -10218,7 +10629,7 @@ ${prompt}` : prompt;
10218
10629
  return void 0;
10219
10630
  }
10220
10631
  try {
10221
- await (0, import_promises19.mkdir)(logDir, { recursive: true });
10632
+ await (0, import_promises20.mkdir)(logDir, { recursive: true });
10222
10633
  } catch (error) {
10223
10634
  const message = error instanceof Error ? error.message : String(error);
10224
10635
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
@@ -10775,7 +11186,7 @@ init_cjs_shims();
10775
11186
  var import_node_child_process5 = require("child_process");
10776
11187
  var import_node_crypto8 = require("crypto");
10777
11188
  var import_node_fs10 = require("fs");
10778
- var import_promises20 = require("fs/promises");
11189
+ var import_promises21 = require("fs/promises");
10779
11190
  var import_node_path24 = __toESM(require("path"), 1);
10780
11191
  var import_node_readline = require("readline");
10781
11192
  var import_node_url4 = require("url");
@@ -11236,7 +11647,7 @@ ${fileList}`;
11236
11647
  return void 0;
11237
11648
  }
11238
11649
  try {
11239
- await (0, import_promises20.mkdir)(logDir, { recursive: true });
11650
+ await (0, import_promises21.mkdir)(logDir, { recursive: true });
11240
11651
  } catch (error) {
11241
11652
  const message = error instanceof Error ? error.message : String(error);
11242
11653
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
@@ -12675,8 +13086,8 @@ function resolveCliConfig(target, env, evalFilePath) {
12675
13086
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
12676
13087
  if (!parseResult.success) {
12677
13088
  const firstError = parseResult.error.errors[0];
12678
- const path55 = firstError?.path.join(".") || "";
12679
- const prefix = path55 ? `${target.name} ${path55}: ` : `${target.name}: `;
13089
+ const path56 = firstError?.path.join(".") || "";
13090
+ const prefix = path56 ? `${target.name} ${path56}: ` : `${target.name}: `;
12680
13091
  throw new Error(`${prefix}${firstError?.message}`);
12681
13092
  }
12682
13093
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -12923,7 +13334,7 @@ function resolveOptionalNumberArray(source, description) {
12923
13334
  // src/evaluation/providers/vscode-provider.ts
12924
13335
  init_cjs_shims();
12925
13336
  var import_node_child_process7 = require("child_process");
12926
- var import_promises27 = require("fs/promises");
13337
+ var import_promises28 = require("fs/promises");
12927
13338
  var import_node_path36 = __toESM(require("path"), 1);
12928
13339
  var import_node_util3 = require("util");
12929
13340
 
@@ -12932,27 +13343,27 @@ init_cjs_shims();
12932
13343
 
12933
13344
  // src/evaluation/providers/vscode/dispatch/agentDispatch.ts
12934
13345
  init_cjs_shims();
12935
- var import_promises25 = require("fs/promises");
13346
+ var import_promises26 = require("fs/promises");
12936
13347
  var import_node_path34 = __toESM(require("path"), 1);
12937
13348
 
12938
13349
  // src/evaluation/providers/vscode/utils/fs.ts
12939
13350
  init_cjs_shims();
12940
13351
  var import_node_fs11 = require("fs");
12941
- var import_promises21 = require("fs/promises");
13352
+ var import_promises22 = require("fs/promises");
12942
13353
  var import_node_path26 = __toESM(require("path"), 1);
12943
13354
  async function pathExists(target) {
12944
13355
  try {
12945
- await (0, import_promises21.access)(target, import_node_fs11.constants.F_OK);
13356
+ await (0, import_promises22.access)(target, import_node_fs11.constants.F_OK);
12946
13357
  return true;
12947
13358
  } catch {
12948
13359
  return false;
12949
13360
  }
12950
13361
  }
12951
13362
  async function ensureDir(target) {
12952
- await (0, import_promises21.mkdir)(target, { recursive: true });
13363
+ await (0, import_promises22.mkdir)(target, { recursive: true });
12953
13364
  }
12954
13365
  async function readDirEntries(target) {
12955
- const entries = await (0, import_promises21.readdir)(target, { withFileTypes: true });
13366
+ const entries = await (0, import_promises22.readdir)(target, { withFileTypes: true });
12956
13367
  return entries.map((entry) => ({
12957
13368
  name: entry.name,
12958
13369
  absolutePath: import_node_path26.default.join(target, entry.name),
@@ -12961,7 +13372,7 @@ async function readDirEntries(target) {
12961
13372
  }
12962
13373
  async function removeIfExists(target) {
12963
13374
  try {
12964
- await (0, import_promises21.rm)(target, { force: true, recursive: false });
13375
+ await (0, import_promises22.rm)(target, { force: true, recursive: false });
12965
13376
  } catch (error) {
12966
13377
  if (error.code !== "ENOENT") {
12967
13378
  throw error;
@@ -13087,7 +13498,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
13087
13498
 
13088
13499
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
13089
13500
  init_cjs_shims();
13090
- var import_promises22 = require("fs/promises");
13501
+ var import_promises23 = require("fs/promises");
13091
13502
  var import_node_path29 = __toESM(require("path"), 1);
13092
13503
 
13093
13504
  // src/evaluation/providers/vscode/utils/time.ts
@@ -13127,7 +13538,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
13127
13538
  const maxAttempts = 10;
13128
13539
  while (attempts < maxAttempts) {
13129
13540
  try {
13130
- const content = await (0, import_promises22.readFile)(responseFileFinal, { encoding: "utf8" });
13541
+ const content = await (0, import_promises23.readFile)(responseFileFinal, { encoding: "utf8" });
13131
13542
  if (!silent) {
13132
13543
  process.stdout.write(`${content}
13133
13544
  `);
@@ -13184,7 +13595,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
13184
13595
  const maxAttempts = 10;
13185
13596
  while (attempts < maxAttempts) {
13186
13597
  try {
13187
- const content = await (0, import_promises22.readFile)(file, { encoding: "utf8" });
13598
+ const content = await (0, import_promises23.readFile)(file, { encoding: "utf8" });
13188
13599
  if (!silent) {
13189
13600
  process.stdout.write(`${content}
13190
13601
  `);
@@ -13208,7 +13619,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
13208
13619
  // src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
13209
13620
  init_cjs_shims();
13210
13621
  var import_node_child_process6 = require("child_process");
13211
- var import_promises23 = require("fs/promises");
13622
+ var import_promises24 = require("fs/promises");
13212
13623
  var import_node_path31 = __toESM(require("path"), 1);
13213
13624
  var import_node_util2 = require("util");
13214
13625
 
@@ -13289,9 +13700,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
13289
13700
  const aliveFile = import_node_path31.default.join(subagentDir, DEFAULT_ALIVE_FILENAME);
13290
13701
  await removeIfExists(aliveFile);
13291
13702
  const githubAgentsDir = import_node_path31.default.join(subagentDir, ".github", "agents");
13292
- await (0, import_promises23.mkdir)(githubAgentsDir, { recursive: true });
13703
+ await (0, import_promises24.mkdir)(githubAgentsDir, { recursive: true });
13293
13704
  const wakeupDst = import_node_path31.default.join(githubAgentsDir, "wakeup.md");
13294
- await (0, import_promises23.writeFile)(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
13705
+ await (0, import_promises24.writeFile)(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
13295
13706
  const workspaceChild = spawnVsCode(vscodeCmd, [workspacePath], {
13296
13707
  label: "open-workspace"
13297
13708
  });
@@ -13320,9 +13731,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
13320
13731
  async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, requestInstructions, timestamp, vscodeCmd) {
13321
13732
  const workspacePath = import_node_path31.default.join(subagentDir, `${import_node_path31.default.basename(subagentDir)}.code-workspace`);
13322
13733
  const messagesDir = import_node_path31.default.join(subagentDir, "messages");
13323
- await (0, import_promises23.mkdir)(messagesDir, { recursive: true });
13734
+ await (0, import_promises24.mkdir)(messagesDir, { recursive: true });
13324
13735
  const reqFile = import_node_path31.default.join(messagesDir, `${timestamp}_req.md`);
13325
- await (0, import_promises23.writeFile)(reqFile, requestInstructions, { encoding: "utf8" });
13736
+ await (0, import_promises24.writeFile)(reqFile, requestInstructions, { encoding: "utf8" });
13326
13737
  const reqUri = pathToFileUri2(reqFile);
13327
13738
  const chatArgs = ["-r", "chat", "-m", chatId];
13328
13739
  for (const attachment of attachmentPaths) {
@@ -13348,7 +13759,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
13348
13759
  async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, chatInstruction, vscodeCmd) {
13349
13760
  const workspacePath = import_node_path31.default.join(subagentDir, `${import_node_path31.default.basename(subagentDir)}.code-workspace`);
13350
13761
  const messagesDir = import_node_path31.default.join(subagentDir, "messages");
13351
- await (0, import_promises23.mkdir)(messagesDir, { recursive: true });
13762
+ await (0, import_promises24.mkdir)(messagesDir, { recursive: true });
13352
13763
  const chatArgs = ["-r", "chat", "-m", chatId];
13353
13764
  for (const attachment of attachmentPaths) {
13354
13765
  chatArgs.push("-a", attachment);
@@ -13372,7 +13783,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
13372
13783
 
13373
13784
  // src/evaluation/providers/vscode/dispatch/workspaceManager.ts
13374
13785
  init_cjs_shims();
13375
- var import_promises24 = require("fs/promises");
13786
+ var import_promises25 = require("fs/promises");
13376
13787
  var import_node_path33 = __toESM(require("path"), 1);
13377
13788
 
13378
13789
  // src/evaluation/providers/vscode/utils/workspace.ts
@@ -13486,11 +13897,11 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
13486
13897
  if (!await pathExists(workspaceSrc)) {
13487
13898
  throw new Error(`workspace template not found: ${workspaceSrc}`);
13488
13899
  }
13489
- const stats = await (0, import_promises24.stat)(workspaceSrc);
13900
+ const stats = await (0, import_promises25.stat)(workspaceSrc);
13490
13901
  if (!stats.isFile()) {
13491
13902
  throw new Error(`workspace template must be a file, not a directory: ${workspaceSrc}`);
13492
13903
  }
13493
- const templateText = await (0, import_promises24.readFile)(workspaceSrc, "utf8");
13904
+ const templateText = await (0, import_promises25.readFile)(workspaceSrc, "utf8");
13494
13905
  workspaceContent = JSON.parse(templateText);
13495
13906
  } else {
13496
13907
  workspaceContent = DEFAULT_WORKSPACE_TEMPLATE;
@@ -13509,15 +13920,15 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
13509
13920
  transformedContent = JSON.stringify(parsed, null, 2);
13510
13921
  }
13511
13922
  }
13512
- await (0, import_promises24.writeFile)(workspaceDst, transformedContent, "utf8");
13923
+ await (0, import_promises25.writeFile)(workspaceDst, transformedContent, "utf8");
13513
13924
  const messagesDir = import_node_path33.default.join(subagentDir, "messages");
13514
- await (0, import_promises24.mkdir)(messagesDir, { recursive: true });
13925
+ await (0, import_promises25.mkdir)(messagesDir, { recursive: true });
13515
13926
  return { workspace: workspaceDst, messagesDir };
13516
13927
  }
13517
13928
  async function createSubagentLock(subagentDir) {
13518
13929
  const messagesDir = import_node_path33.default.join(subagentDir, "messages");
13519
13930
  if (await pathExists(messagesDir)) {
13520
- const files = await (0, import_promises24.readdir)(messagesDir);
13931
+ const files = await (0, import_promises25.readdir)(messagesDir);
13521
13932
  await Promise.all(
13522
13933
  files.map(async (file) => {
13523
13934
  const target = import_node_path33.default.join(messagesDir, file);
@@ -13527,14 +13938,14 @@ async function createSubagentLock(subagentDir) {
13527
13938
  }
13528
13939
  const githubAgentsDir = import_node_path33.default.join(subagentDir, ".github", "agents");
13529
13940
  if (await pathExists(githubAgentsDir)) {
13530
- const agentFiles = await (0, import_promises24.readdir)(githubAgentsDir);
13941
+ const agentFiles = await (0, import_promises25.readdir)(githubAgentsDir);
13531
13942
  const preservedFiles = /* @__PURE__ */ new Set(["wakeup.md", "subagent.md"]);
13532
13943
  await Promise.all(
13533
13944
  agentFiles.filter((file) => file.endsWith(".md") && !preservedFiles.has(file)).map((file) => removeIfExists(import_node_path33.default.join(githubAgentsDir, file)))
13534
13945
  );
13535
13946
  }
13536
13947
  const lockFile = import_node_path33.default.join(subagentDir, DEFAULT_LOCK_NAME);
13537
- await (0, import_promises24.writeFile)(lockFile, "", { encoding: "utf8" });
13948
+ await (0, import_promises25.writeFile)(lockFile, "", { encoding: "utf8" });
13538
13949
  return lockFile;
13539
13950
  }
13540
13951
  async function removeSubagentLock(subagentDir) {
@@ -13559,10 +13970,10 @@ async function prepareSubagentDirectory(subagentDir, promptFile, chatId, workspa
13559
13970
  }
13560
13971
  if (promptFile) {
13561
13972
  const githubAgentsDir = import_node_path33.default.join(subagentDir, ".github", "agents");
13562
- await (0, import_promises24.mkdir)(githubAgentsDir, { recursive: true });
13973
+ await (0, import_promises25.mkdir)(githubAgentsDir, { recursive: true });
13563
13974
  const agentFile = import_node_path33.default.join(githubAgentsDir, `${chatId}.md`);
13564
13975
  try {
13565
- await (0, import_promises24.copyFile)(promptFile, agentFile);
13976
+ await (0, import_promises25.copyFile)(promptFile, agentFile);
13566
13977
  } catch (error) {
13567
13978
  console.error(`error: Failed to copy prompt file to agent mode: ${error.message}`);
13568
13979
  return 1;
@@ -13583,7 +13994,7 @@ async function resolvePromptFile(promptFile) {
13583
13994
  if (!await pathExists(resolvedPrompt)) {
13584
13995
  throw new Error(`Prompt file not found: ${resolvedPrompt}`);
13585
13996
  }
13586
- const promptStats = await (0, import_promises25.stat)(resolvedPrompt);
13997
+ const promptStats = await (0, import_promises26.stat)(resolvedPrompt);
13587
13998
  if (!promptStats.isFile()) {
13588
13999
  throw new Error(`Prompt file must be a file, not a directory: ${resolvedPrompt}`);
13589
14000
  }
@@ -13820,7 +14231,7 @@ async function dispatchBatchAgent(options) {
13820
14231
  const reqFile = requestFiles[index];
13821
14232
  const tmpFile = responseTmpFiles[index];
13822
14233
  const finalFile = responseFilesFinal[index];
13823
- return (0, import_promises25.writeFile)(
14234
+ return (0, import_promises26.writeFile)(
13824
14235
  reqFile,
13825
14236
  createBatchRequestPrompt(query, tmpFile, finalFile, batchRequestTemplateContent),
13826
14237
  { encoding: "utf8" }
@@ -13832,7 +14243,7 @@ async function dispatchBatchAgent(options) {
13832
14243
  responseFilesFinal,
13833
14244
  orchestratorTemplateContent
13834
14245
  );
13835
- await (0, import_promises25.writeFile)(orchestratorFile, orchestratorContent, { encoding: "utf8" });
14246
+ await (0, import_promises26.writeFile)(orchestratorFile, orchestratorContent, { encoding: "utf8" });
13836
14247
  }
13837
14248
  const chatAttachments = [orchestratorFile, ...attachments];
13838
14249
  const orchestratorUri = pathToFileUri2(orchestratorFile);
@@ -13899,7 +14310,7 @@ async function dispatchBatchAgent(options) {
13899
14310
 
13900
14311
  // src/evaluation/providers/vscode/dispatch/provision.ts
13901
14312
  init_cjs_shims();
13902
- var import_promises26 = require("fs/promises");
14313
+ var import_promises27 = require("fs/promises");
13903
14314
  var import_node_path35 = __toESM(require("path"), 1);
13904
14315
  var DEFAULT_WORKSPACE_TEMPLATE2 = {
13905
14316
  folders: [
@@ -13980,8 +14391,8 @@ async function provisionSubagents(options) {
13980
14391
  if (!dryRun) {
13981
14392
  await removeIfExists(lockFile);
13982
14393
  await ensureDir(githubAgentsDir);
13983
- await (0, import_promises26.writeFile)(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
13984
- await (0, import_promises26.writeFile)(wakeupDst, wakeupContent, "utf8");
14394
+ await (0, import_promises27.writeFile)(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
14395
+ await (0, import_promises27.writeFile)(wakeupDst, wakeupContent, "utf8");
13985
14396
  }
13986
14397
  created.push(subagentDir);
13987
14398
  lockedSubagents.delete(subagentDir);
@@ -13991,8 +14402,8 @@ async function provisionSubagents(options) {
13991
14402
  if (!isLocked && force) {
13992
14403
  if (!dryRun) {
13993
14404
  await ensureDir(githubAgentsDir);
13994
- await (0, import_promises26.writeFile)(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
13995
- await (0, import_promises26.writeFile)(wakeupDst, wakeupContent, "utf8");
14405
+ await (0, import_promises27.writeFile)(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
14406
+ await (0, import_promises27.writeFile)(wakeupDst, wakeupContent, "utf8");
13996
14407
  }
13997
14408
  created.push(subagentDir);
13998
14409
  subagentsProvisioned += 1;
@@ -14000,8 +14411,8 @@ async function provisionSubagents(options) {
14000
14411
  }
14001
14412
  if (!dryRun && !await pathExists(workspaceDst)) {
14002
14413
  await ensureDir(githubAgentsDir);
14003
- await (0, import_promises26.writeFile)(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
14004
- await (0, import_promises26.writeFile)(wakeupDst, wakeupContent, "utf8");
14414
+ await (0, import_promises27.writeFile)(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
14415
+ await (0, import_promises27.writeFile)(wakeupDst, wakeupContent, "utf8");
14005
14416
  }
14006
14417
  skippedExisting.push(subagentDir);
14007
14418
  subagentsProvisioned += 1;
@@ -14016,8 +14427,8 @@ async function provisionSubagents(options) {
14016
14427
  if (!dryRun) {
14017
14428
  await ensureDir(subagentDir);
14018
14429
  await ensureDir(githubAgentsDir);
14019
- await (0, import_promises26.writeFile)(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
14020
- await (0, import_promises26.writeFile)(wakeupDst, wakeupContent, "utf8");
14430
+ await (0, import_promises27.writeFile)(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
14431
+ await (0, import_promises27.writeFile)(wakeupDst, wakeupContent, "utf8");
14021
14432
  }
14022
14433
  created.push(subagentDir);
14023
14434
  subagentsProvisioned += 1;
@@ -14205,7 +14616,7 @@ async function locateVSCodeExecutable(candidate) {
14205
14616
  if (includesPathSeparator) {
14206
14617
  const resolved = import_node_path36.default.isAbsolute(candidate) ? candidate : import_node_path36.default.resolve(candidate);
14207
14618
  try {
14208
- await (0, import_promises27.access)(resolved, import_promises27.constants.F_OK);
14619
+ await (0, import_promises28.access)(resolved, import_promises28.constants.F_OK);
14209
14620
  return resolved;
14210
14621
  } catch {
14211
14622
  throw new Error(
@@ -14218,7 +14629,7 @@ async function locateVSCodeExecutable(candidate) {
14218
14629
  const { stdout } = await execAsync3(`${locator} ${candidate}`);
14219
14630
  const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
14220
14631
  if (lines.length > 0 && lines[0]) {
14221
- await (0, import_promises27.access)(lines[0], import_promises27.constants.F_OK);
14632
+ await (0, import_promises28.access)(lines[0], import_promises28.constants.F_OK);
14222
14633
  return lines[0];
14223
14634
  }
14224
14635
  } catch {
@@ -14232,7 +14643,7 @@ async function resolveWorkspaceTemplateFile(template) {
14232
14643
  return void 0;
14233
14644
  }
14234
14645
  try {
14235
- const stats = await (0, import_promises27.stat)(import_node_path36.default.resolve(template));
14646
+ const stats = await (0, import_promises28.stat)(import_node_path36.default.resolve(template));
14236
14647
  return stats.isFile() ? template : void 0;
14237
14648
  } catch {
14238
14649
  return template;
@@ -14401,9 +14812,9 @@ function isAgentProvider(provider) {
14401
14812
  // src/evaluation/providers/targets-file.ts
14402
14813
  init_cjs_shims();
14403
14814
  var import_node_fs12 = require("fs");
14404
- var import_promises28 = require("fs/promises");
14815
+ var import_promises29 = require("fs/promises");
14405
14816
  var import_node_path37 = __toESM(require("path"), 1);
14406
- var import_yaml7 = require("yaml");
14817
+ var import_yaml8 = require("yaml");
14407
14818
  function isRecord(value) {
14408
14819
  return typeof value === "object" && value !== null && !Array.isArray(value);
14409
14820
  }
@@ -14435,7 +14846,7 @@ function assertTargetDefinition(value, index, filePath) {
14435
14846
  }
14436
14847
  async function fileExists3(filePath) {
14437
14848
  try {
14438
- await (0, import_promises28.access)(filePath, import_node_fs12.constants.F_OK);
14849
+ await (0, import_promises29.access)(filePath, import_node_fs12.constants.F_OK);
14439
14850
  return true;
14440
14851
  } catch {
14441
14852
  return false;
@@ -14446,8 +14857,8 @@ async function readTargetDefinitions(filePath) {
14446
14857
  if (!await fileExists3(absolutePath)) {
14447
14858
  throw new Error(`targets.yaml not found at ${absolutePath}`);
14448
14859
  }
14449
- const raw = await (0, import_promises28.readFile)(absolutePath, "utf8");
14450
- const parsed = (0, import_yaml7.parse)(raw);
14860
+ const raw = await (0, import_promises29.readFile)(absolutePath, "utf8");
14861
+ const parsed = (0, import_yaml8.parse)(raw);
14451
14862
  if (!isRecord(parsed)) {
14452
14863
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
14453
14864
  }
@@ -14618,9 +15029,10 @@ function negateScore(score) {
14618
15029
 
14619
15030
  // src/evaluation/evaluators/code-evaluator.ts
14620
15031
  init_cjs_shims();
14621
- var import_promises29 = require("fs/promises");
15032
+ var import_promises30 = require("fs/promises");
14622
15033
  var import_node_os7 = require("os");
14623
15034
  var import_node_path39 = require("path");
15035
+ init_exec();
14624
15036
 
14625
15037
  // src/runtime/target-proxy.ts
14626
15038
  init_cjs_shims();
@@ -14900,6 +15312,19 @@ function toCamelCaseDeep(obj) {
14900
15312
  return obj;
14901
15313
  }
14902
15314
 
15315
+ // src/evaluation/workspace/repo-checkout.ts
15316
+ init_cjs_shims();
15317
+ function getRepoCheckoutRef(checkout) {
15318
+ return checkout?.base_commit ?? checkout?.ref ?? "HEAD";
15319
+ }
15320
+ function getRepoCheckoutTargets(repos) {
15321
+ if (!repos) return [];
15322
+ return repos.filter((repo) => repo.checkout?.base_commit || repo.checkout?.ref).map((repo) => ({
15323
+ path: repo.path,
15324
+ ref: getRepoCheckoutRef(repo.checkout)
15325
+ }));
15326
+ }
15327
+
14903
15328
  // src/evaluation/evaluators/code-evaluator.ts
14904
15329
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
14905
15330
  var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
@@ -14942,7 +15367,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
14942
15367
  const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
14943
15368
  const dir = await getWorkDir();
14944
15369
  const filePath = (0, import_node_path39.join)(dir, `img-${counter++}.${ext}`);
14945
- await (0, import_promises29.writeFile)(filePath, Buffer.from(base64Data, "base64"));
15370
+ await (0, import_promises30.writeFile)(filePath, Buffer.from(base64Data, "base64"));
14946
15371
  blocks.push({ type: "image", media_type: img.media_type, path: filePath });
14947
15372
  } else {
14948
15373
  blocks.push({ type: "image", media_type: img.media_type, path: img.source });
@@ -14970,7 +15395,7 @@ var CodeEvaluator = class {
14970
15395
  let imageTmpDir;
14971
15396
  const getImageDir = async () => {
14972
15397
  if (!imageTmpDir) {
14973
- imageTmpDir = await (0, import_promises29.mkdtemp)((0, import_node_path39.join)((0, import_node_os7.tmpdir)(), "agentv-img-"));
15398
+ imageTmpDir = await (0, import_promises30.mkdtemp)((0, import_node_path39.join)((0, import_node_os7.tmpdir)(), "agentv-img-"));
14974
15399
  }
14975
15400
  return imageTmpDir;
14976
15401
  };
@@ -14983,9 +15408,9 @@ var CodeEvaluator = class {
14983
15408
  if (outputForPayload) {
14984
15409
  const serialized = JSON.stringify(outputForPayload);
14985
15410
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
14986
- const tmpDir = await (0, import_promises29.mkdtemp)((0, import_node_path39.join)((0, import_node_os7.tmpdir)(), "agentv-grader-"));
15411
+ const tmpDir = await (0, import_promises30.mkdtemp)((0, import_node_path39.join)((0, import_node_os7.tmpdir)(), "agentv-grader-"));
14987
15412
  outputPath = (0, import_node_path39.join)(tmpDir, "output.json");
14988
- await (0, import_promises29.writeFile)(outputPath, serialized);
15413
+ await (0, import_promises30.writeFile)(outputPath, serialized);
14989
15414
  outputForPayload = null;
14990
15415
  }
14991
15416
  }
@@ -15034,13 +15459,31 @@ var CodeEvaluator = class {
15034
15459
  const workspaceEnv = context2.workspacePath ? { AGENTV_WORKSPACE_PATH: context2.workspacePath } : void 0;
15035
15460
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
15036
15461
  try {
15037
- const stdout = await executeScript(
15038
- this.command,
15039
- inputPayload,
15040
- this.agentTimeoutMs,
15041
- this.cwd,
15042
- env
15043
- );
15462
+ let stdout;
15463
+ if (context2.dockerConfig) {
15464
+ const { DockerWorkspaceProvider: DockerWorkspaceProvider2 } = await Promise.resolve().then(() => (init_docker_workspace(), docker_workspace_exports));
15465
+ const dockerProvider = new DockerWorkspaceProvider2(context2.dockerConfig);
15466
+ const result = await dockerProvider.runGraderInContainer({
15467
+ command: [...this.command],
15468
+ stdin: inputPayload,
15469
+ repoCheckouts: getRepoCheckoutTargets(context2.evalCase.workspace?.repos)
15470
+ });
15471
+ if (result.exitCode !== 0) {
15472
+ const trimmedErr = result.stderr.trim();
15473
+ throw new Error(
15474
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
15475
+ );
15476
+ }
15477
+ stdout = result.stdout.trim();
15478
+ } else {
15479
+ stdout = await executeScript(
15480
+ this.command,
15481
+ inputPayload,
15482
+ this.agentTimeoutMs,
15483
+ this.cwd,
15484
+ env
15485
+ );
15486
+ }
15044
15487
  const parsed = parseJsonSafe(stdout);
15045
15488
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
15046
15489
  const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
@@ -15097,11 +15540,11 @@ var CodeEvaluator = class {
15097
15540
  await proxyShutdown();
15098
15541
  }
15099
15542
  if (outputPath) {
15100
- await (0, import_promises29.rm)((0, import_node_path39.dirname)(outputPath), { recursive: true, force: true }).catch(() => {
15543
+ await (0, import_promises30.rm)((0, import_node_path39.dirname)(outputPath), { recursive: true, force: true }).catch(() => {
15101
15544
  });
15102
15545
  }
15103
15546
  if (imageTmpDir) {
15104
- await (0, import_promises29.rm)(imageTmpDir, { recursive: true, force: true }).catch(() => {
15547
+ await (0, import_promises30.rm)(imageTmpDir, { recursive: true, force: true }).catch(() => {
15105
15548
  });
15106
15549
  }
15107
15550
  }
@@ -15134,7 +15577,7 @@ var import_ai3 = require("ai");
15134
15577
 
15135
15578
  // src/evaluation/evaluators/llm-grader.ts
15136
15579
  init_cjs_shims();
15137
- var import_promises30 = __toESM(require("fs/promises"), 1);
15580
+ var import_promises31 = __toESM(require("fs/promises"), 1);
15138
15581
  var import_node_path40 = __toESM(require("path"), 1);
15139
15582
  var import_ai2 = require("ai");
15140
15583
  var import_zod4 = require("zod");
@@ -16185,7 +16628,7 @@ function createFilesystemTools(workspacePath) {
16185
16628
  execute: async (input) => {
16186
16629
  try {
16187
16630
  const resolved = resolveSandboxed(workspacePath, input.path);
16188
- const entries = await import_promises30.default.readdir(resolved, { withFileTypes: true });
16631
+ const entries = await import_promises31.default.readdir(resolved, { withFileTypes: true });
16189
16632
  return entries.map((e) => ({
16190
16633
  name: e.name,
16191
16634
  type: e.isDirectory() ? "directory" : "file"
@@ -16203,20 +16646,20 @@ function createFilesystemTools(workspacePath) {
16203
16646
  execute: async (input) => {
16204
16647
  try {
16205
16648
  const resolved = resolveSandboxed(workspacePath, input.path);
16206
- const stat11 = await import_promises30.default.stat(resolved);
16207
- if (stat11.isDirectory()) {
16649
+ const stat12 = await import_promises31.default.stat(resolved);
16650
+ if (stat12.isDirectory()) {
16208
16651
  return { error: `'${input.path}' is a directory, not a file` };
16209
16652
  }
16210
- const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
16211
- const fd = await import_promises30.default.open(resolved, "r");
16653
+ const buffer = Buffer.alloc(Math.min(stat12.size, MAX_FILE_SIZE));
16654
+ const fd = await import_promises31.default.open(resolved, "r");
16212
16655
  try {
16213
16656
  await fd.read(buffer, 0, buffer.length, 0);
16214
16657
  } finally {
16215
16658
  await fd.close();
16216
16659
  }
16217
16660
  const content = buffer.toString("utf-8");
16218
- const truncated = stat11.size > MAX_FILE_SIZE;
16219
- return { content, truncated, size: stat11.size };
16661
+ const truncated = stat12.size > MAX_FILE_SIZE;
16662
+ return { content, truncated, size: stat12.size };
16220
16663
  } catch (error) {
16221
16664
  return { error: error instanceof Error ? error.message : String(error) };
16222
16665
  }
@@ -16253,7 +16696,7 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
16253
16696
  if (matches.length >= MAX_SEARCH_MATCHES) return;
16254
16697
  let entries;
16255
16698
  try {
16256
- entries = await import_promises30.default.readdir(dirPath, { withFileTypes: true });
16699
+ entries = await import_promises31.default.readdir(dirPath, { withFileTypes: true });
16257
16700
  } catch {
16258
16701
  return;
16259
16702
  }
@@ -16267,9 +16710,9 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
16267
16710
  const ext = import_node_path40.default.extname(entry.name).toLowerCase();
16268
16711
  if (BINARY_EXTENSIONS.has(ext)) continue;
16269
16712
  try {
16270
- const stat11 = await import_promises30.default.stat(fullPath);
16271
- if (stat11.size > MAX_FILE_SIZE) continue;
16272
- const content = await import_promises30.default.readFile(fullPath, "utf-8");
16713
+ const stat12 = await import_promises31.default.stat(fullPath);
16714
+ if (stat12.size > MAX_FILE_SIZE) continue;
16715
+ const content = await import_promises31.default.readFile(fullPath, "utf-8");
16273
16716
  const lines = content.split("\n");
16274
16717
  for (let i = 0; i < lines.length; i++) {
16275
16718
  if (matches.length >= MAX_SEARCH_MATCHES) return;
@@ -16912,115 +17355,115 @@ var FieldAccuracyEvaluator = class {
16912
17355
  * Evaluate a single field against the expected value.
16913
17356
  */
16914
17357
  evaluateField(fieldConfig, candidateData, expectedData) {
16915
- const { path: path55, match, required = true, weight = 1 } = fieldConfig;
16916
- const candidateValue = resolvePath(candidateData, path55);
16917
- const expectedValue = resolvePath(expectedData, path55);
17358
+ const { path: path56, match, required = true, weight = 1 } = fieldConfig;
17359
+ const candidateValue = resolvePath(candidateData, path56);
17360
+ const expectedValue = resolvePath(expectedData, path56);
16918
17361
  if (expectedValue === void 0) {
16919
17362
  return {
16920
- path: path55,
17363
+ path: path56,
16921
17364
  score: 1,
16922
17365
  // No expected value means no comparison needed
16923
17366
  weight,
16924
17367
  hit: true,
16925
- message: `${path55}: no expected value`
17368
+ message: `${path56}: no expected value`
16926
17369
  };
16927
17370
  }
16928
17371
  if (candidateValue === void 0) {
16929
17372
  if (required) {
16930
17373
  return {
16931
- path: path55,
17374
+ path: path56,
16932
17375
  score: 0,
16933
17376
  weight,
16934
17377
  hit: false,
16935
- message: `${path55} (required, missing)`
17378
+ message: `${path56} (required, missing)`
16936
17379
  };
16937
17380
  }
16938
17381
  return {
16939
- path: path55,
17382
+ path: path56,
16940
17383
  score: 1,
16941
17384
  // Don't penalize missing optional fields
16942
17385
  weight: 0,
16943
17386
  // Zero weight means it won't affect the score
16944
17387
  hit: true,
16945
- message: `${path55}: optional field missing`
17388
+ message: `${path56}: optional field missing`
16946
17389
  };
16947
17390
  }
16948
17391
  switch (match) {
16949
17392
  case "exact":
16950
- return this.compareExact(path55, candidateValue, expectedValue, weight);
17393
+ return this.compareExact(path56, candidateValue, expectedValue, weight);
16951
17394
  case "numeric_tolerance":
16952
17395
  return this.compareNumericTolerance(
16953
- path55,
17396
+ path56,
16954
17397
  candidateValue,
16955
17398
  expectedValue,
16956
17399
  fieldConfig,
16957
17400
  weight
16958
17401
  );
16959
17402
  case "date":
16960
- return this.compareDate(path55, candidateValue, expectedValue, fieldConfig, weight);
17403
+ return this.compareDate(path56, candidateValue, expectedValue, fieldConfig, weight);
16961
17404
  default:
16962
17405
  return {
16963
- path: path55,
17406
+ path: path56,
16964
17407
  score: 0,
16965
17408
  weight,
16966
17409
  hit: false,
16967
- message: `${path55}: unknown match type "${match}"`
17410
+ message: `${path56}: unknown match type "${match}"`
16968
17411
  };
16969
17412
  }
16970
17413
  }
16971
17414
  /**
16972
17415
  * Exact equality comparison.
16973
17416
  */
16974
- compareExact(path55, candidateValue, expectedValue, weight) {
17417
+ compareExact(path56, candidateValue, expectedValue, weight) {
16975
17418
  if (deepEqual(candidateValue, expectedValue)) {
16976
17419
  return {
16977
- path: path55,
17420
+ path: path56,
16978
17421
  score: 1,
16979
17422
  weight,
16980
17423
  hit: true,
16981
- message: path55
17424
+ message: path56
16982
17425
  };
16983
17426
  }
16984
17427
  if (typeof candidateValue !== typeof expectedValue) {
16985
17428
  return {
16986
- path: path55,
17429
+ path: path56,
16987
17430
  score: 0,
16988
17431
  weight,
16989
17432
  hit: false,
16990
- message: `${path55} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
17433
+ message: `${path56} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
16991
17434
  };
16992
17435
  }
16993
17436
  return {
16994
- path: path55,
17437
+ path: path56,
16995
17438
  score: 0,
16996
17439
  weight,
16997
17440
  hit: false,
16998
- message: `${path55} (value mismatch)`
17441
+ message: `${path56} (value mismatch)`
16999
17442
  };
17000
17443
  }
17001
17444
  /**
17002
17445
  * Numeric comparison with absolute or relative tolerance.
17003
17446
  */
17004
- compareNumericTolerance(path55, candidateValue, expectedValue, fieldConfig, weight) {
17447
+ compareNumericTolerance(path56, candidateValue, expectedValue, fieldConfig, weight) {
17005
17448
  const { tolerance = 0, relative = false } = fieldConfig;
17006
17449
  const candidateNum = toNumber(candidateValue);
17007
17450
  const expectedNum = toNumber(expectedValue);
17008
17451
  if (candidateNum === null || expectedNum === null) {
17009
17452
  return {
17010
- path: path55,
17453
+ path: path56,
17011
17454
  score: 0,
17012
17455
  weight,
17013
17456
  hit: false,
17014
- message: `${path55} (non-numeric value)`
17457
+ message: `${path56} (non-numeric value)`
17015
17458
  };
17016
17459
  }
17017
17460
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
17018
17461
  return {
17019
- path: path55,
17462
+ path: path56,
17020
17463
  score: 0,
17021
17464
  weight,
17022
17465
  hit: false,
17023
- message: `${path55} (invalid numeric value)`
17466
+ message: `${path56} (invalid numeric value)`
17024
17467
  };
17025
17468
  }
17026
17469
  const diff = Math.abs(candidateNum - expectedNum);
@@ -17033,61 +17476,61 @@ var FieldAccuracyEvaluator = class {
17033
17476
  }
17034
17477
  if (withinTolerance) {
17035
17478
  return {
17036
- path: path55,
17479
+ path: path56,
17037
17480
  score: 1,
17038
17481
  weight,
17039
17482
  hit: true,
17040
- message: `${path55} (within tolerance: diff=${diff.toFixed(2)})`
17483
+ message: `${path56} (within tolerance: diff=${diff.toFixed(2)})`
17041
17484
  };
17042
17485
  }
17043
17486
  return {
17044
- path: path55,
17487
+ path: path56,
17045
17488
  score: 0,
17046
17489
  weight,
17047
17490
  hit: false,
17048
- message: `${path55} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
17491
+ message: `${path56} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
17049
17492
  };
17050
17493
  }
17051
17494
  /**
17052
17495
  * Date comparison with format normalization.
17053
17496
  */
17054
- compareDate(path55, candidateValue, expectedValue, fieldConfig, weight) {
17497
+ compareDate(path56, candidateValue, expectedValue, fieldConfig, weight) {
17055
17498
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
17056
17499
  const candidateDate = parseDate(String(candidateValue), formats);
17057
17500
  const expectedDate = parseDate(String(expectedValue), formats);
17058
17501
  if (candidateDate === null) {
17059
17502
  return {
17060
- path: path55,
17503
+ path: path56,
17061
17504
  score: 0,
17062
17505
  weight,
17063
17506
  hit: false,
17064
- message: `${path55} (unparseable candidate date)`
17507
+ message: `${path56} (unparseable candidate date)`
17065
17508
  };
17066
17509
  }
17067
17510
  if (expectedDate === null) {
17068
17511
  return {
17069
- path: path55,
17512
+ path: path56,
17070
17513
  score: 0,
17071
17514
  weight,
17072
17515
  hit: false,
17073
- message: `${path55} (unparseable expected date)`
17516
+ message: `${path56} (unparseable expected date)`
17074
17517
  };
17075
17518
  }
17076
17519
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
17077
17520
  return {
17078
- path: path55,
17521
+ path: path56,
17079
17522
  score: 1,
17080
17523
  weight,
17081
17524
  hit: true,
17082
- message: path55
17525
+ message: path56
17083
17526
  };
17084
17527
  }
17085
17528
  return {
17086
- path: path55,
17529
+ path: path56,
17087
17530
  score: 0,
17088
17531
  weight,
17089
17532
  hit: false,
17090
- message: `${path55} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
17533
+ message: `${path56} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
17091
17534
  };
17092
17535
  }
17093
17536
  /**
@@ -17120,11 +17563,11 @@ var FieldAccuracyEvaluator = class {
17120
17563
  };
17121
17564
  }
17122
17565
  };
17123
- function resolvePath(obj, path55) {
17124
- if (!path55 || !obj) {
17566
+ function resolvePath(obj, path56) {
17567
+ if (!path56 || !obj) {
17125
17568
  return void 0;
17126
17569
  }
17127
- const parts = path55.split(/\.|\[|\]/).filter((p) => p.length > 0);
17570
+ const parts = path56.split(/\.|\[|\]/).filter((p) => p.length > 0);
17128
17571
  let current = obj;
17129
17572
  for (const part of parts) {
17130
17573
  if (current === null || current === void 0) {
@@ -17621,8 +18064,8 @@ var TokenUsageEvaluator = class {
17621
18064
 
17622
18065
  // src/evaluation/evaluators/tool-trajectory.ts
17623
18066
  init_cjs_shims();
17624
- function getNestedValue(obj, path55) {
17625
- const parts = path55.split(".");
18067
+ function getNestedValue(obj, path56) {
18068
+ const parts = path56.split(".");
17626
18069
  let current = obj;
17627
18070
  for (const part of parts) {
17628
18071
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -18246,7 +18689,7 @@ function runEqualsAssertion(output, value) {
18246
18689
  init_cjs_shims();
18247
18690
  var import_node_crypto11 = require("crypto");
18248
18691
  var import_node_fs16 = require("fs");
18249
- var import_promises34 = require("fs/promises");
18692
+ var import_promises35 = require("fs/promises");
18250
18693
  var import_node_path49 = __toESM(require("path"), 1);
18251
18694
  var import_micromatch3 = __toESM(require("micromatch"), 1);
18252
18695
 
@@ -18503,6 +18946,15 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
18503
18946
  }
18504
18947
  return void 0;
18505
18948
  }
18949
+ function containsTemplateVariables(text) {
18950
+ const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
18951
+ for (const match of text.matchAll(variablePattern)) {
18952
+ if (VALID_TEMPLATE_VARIABLES.has(match[1])) {
18953
+ return true;
18954
+ }
18955
+ }
18956
+ return false;
18957
+ }
18506
18958
  async function executePromptTemplate(script, context2, config, timeoutMs) {
18507
18959
  const payload = {
18508
18960
  criteria: context2.evalCase.criteria,
@@ -18575,9 +19027,20 @@ var llmGraderFactory = (config, context2) => {
18575
19027
  },
18576
19028
  agentTimeoutMs
18577
19029
  );
19030
+ const isFromInlinePrompt = !c.resolvedPromptScript?.length && !c.resolvedPromptPath && !c.promptPath;
19031
+ let evaluatorTemplateOverride;
19032
+ let evalCase = evalContext.evalCase;
19033
+ if (customPrompt) {
19034
+ if (!isFromInlinePrompt || containsTemplateVariables(customPrompt)) {
19035
+ evaluatorTemplateOverride = customPrompt;
19036
+ } else {
19037
+ evalCase = { ...evalCase, criteria: customPrompt };
19038
+ }
19039
+ }
18578
19040
  return evaluator.evaluate({
18579
19041
  ...evalContext,
18580
- evaluatorTemplateOverride: customPrompt,
19042
+ evalCase,
19043
+ evaluatorTemplateOverride,
18581
19044
  evaluator: c
18582
19045
  });
18583
19046
  }
@@ -19070,7 +19533,7 @@ async function stageNestedRepoChanges(workspacePath) {
19070
19533
 
19071
19534
  // src/evaluation/workspace/manager.ts
19072
19535
  init_cjs_shims();
19073
- var import_promises31 = require("fs/promises");
19536
+ var import_promises32 = require("fs/promises");
19074
19537
  var import_node_path45 = __toESM(require("path"), 1);
19075
19538
  var TemplateNotFoundError = class extends Error {
19076
19539
  constructor(templatePath) {
@@ -19093,7 +19556,7 @@ var WorkspaceCreationError = class extends Error {
19093
19556
  };
19094
19557
  async function isDirectory(filePath) {
19095
19558
  try {
19096
- const stats = await (0, import_promises31.stat)(filePath);
19559
+ const stats = await (0, import_promises32.stat)(filePath);
19097
19560
  return stats.isDirectory();
19098
19561
  } catch {
19099
19562
  return false;
@@ -19104,8 +19567,8 @@ function getWorkspacePath(evalRunId, caseId, workspaceRoot) {
19104
19567
  return import_node_path45.default.join(root, evalRunId, caseId);
19105
19568
  }
19106
19569
  async function copyDirectoryRecursive(src, dest) {
19107
- await (0, import_promises31.mkdir)(dest, { recursive: true });
19108
- const entries = await (0, import_promises31.readdir)(src, { withFileTypes: true });
19570
+ await (0, import_promises32.mkdir)(dest, { recursive: true });
19571
+ const entries = await (0, import_promises32.readdir)(src, { withFileTypes: true });
19109
19572
  for (const entry of entries) {
19110
19573
  const srcPath = import_node_path45.default.join(src, entry.name);
19111
19574
  const destPath = import_node_path45.default.join(dest, entry.name);
@@ -19115,7 +19578,7 @@ async function copyDirectoryRecursive(src, dest) {
19115
19578
  if (entry.isDirectory()) {
19116
19579
  await copyDirectoryRecursive(srcPath, destPath);
19117
19580
  } else {
19118
- await (0, import_promises31.cp)(srcPath, destPath, { preserveTimestamps: true });
19581
+ await (0, import_promises32.cp)(srcPath, destPath, { preserveTimestamps: true });
19119
19582
  }
19120
19583
  }
19121
19584
  }
@@ -19130,7 +19593,7 @@ async function createTempWorkspace(templatePath, evalRunId, caseId, workspaceRoo
19130
19593
  const workspacePath = getWorkspacePath(evalRunId, caseId, workspaceRoot);
19131
19594
  try {
19132
19595
  if (await fileExists2(workspacePath)) {
19133
- await (0, import_promises31.rm)(workspacePath, { recursive: true, force: true });
19596
+ await (0, import_promises32.rm)(workspacePath, { recursive: true, force: true });
19134
19597
  }
19135
19598
  await copyDirectoryRecursive(resolvedTemplatePath, workspacePath);
19136
19599
  return workspacePath;
@@ -19164,14 +19627,14 @@ async function createTempWorkspace(templatePath, evalRunId, caseId, workspaceRoo
19164
19627
  }
19165
19628
  async function cleanupWorkspace(workspacePath) {
19166
19629
  if (await fileExists2(workspacePath)) {
19167
- await (0, import_promises31.rm)(workspacePath, { recursive: true, force: true });
19630
+ await (0, import_promises32.rm)(workspacePath, { recursive: true, force: true });
19168
19631
  }
19169
19632
  }
19170
19633
  async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
19171
19634
  const root = workspaceRoot ?? getWorkspacesRoot();
19172
19635
  const evalDir = import_node_path45.default.join(root, evalRunId);
19173
19636
  if (await fileExists2(evalDir)) {
19174
- await (0, import_promises31.rm)(evalDir, { recursive: true, force: true });
19637
+ await (0, import_promises32.rm)(evalDir, { recursive: true, force: true });
19175
19638
  }
19176
19639
  }
19177
19640
 
@@ -19180,7 +19643,7 @@ init_cjs_shims();
19180
19643
  var import_node_child_process9 = require("child_process");
19181
19644
  var import_node_crypto10 = require("crypto");
19182
19645
  var import_node_fs14 = require("fs");
19183
- var import_promises32 = require("fs/promises");
19646
+ var import_promises33 = require("fs/promises");
19184
19647
  var import_node_path46 = __toESM(require("path"), 1);
19185
19648
  var import_node_util5 = require("util");
19186
19649
  var execFileAsync = (0, import_node_util5.promisify)(import_node_child_process9.execFile);
@@ -19208,12 +19671,14 @@ async function git(args, opts) {
19208
19671
  return stdout.trim();
19209
19672
  }
19210
19673
  function normalizeRepoForFingerprint(repo) {
19211
- const source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
19212
- const result = {
19213
- path: repo.path,
19214
- source,
19215
- ref: repo.checkout?.ref ?? "HEAD"
19216
- };
19674
+ const result = {};
19675
+ if (repo.path) {
19676
+ result.path = repo.path;
19677
+ }
19678
+ if (repo.source) {
19679
+ result.source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
19680
+ }
19681
+ result.ref = getRepoCheckoutRef(repo.checkout);
19217
19682
  if (repo.clone?.depth !== void 0) {
19218
19683
  result.depth = repo.clone.depth;
19219
19684
  }
@@ -19227,13 +19692,13 @@ function normalizeRepoForFingerprint(repo) {
19227
19692
  }
19228
19693
  function computeWorkspaceFingerprint(repos) {
19229
19694
  const canonical = {
19230
- repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint)
19695
+ repos: [...repos].sort((a, b) => (a.path ?? "").localeCompare(b.path ?? "")).map(normalizeRepoForFingerprint)
19231
19696
  };
19232
19697
  return (0, import_node_crypto10.createHash)("sha256").update(JSON.stringify(canonical)).digest("hex");
19233
19698
  }
19234
19699
  async function copyDirectoryRecursive2(src, dest, skipDirs) {
19235
- await (0, import_promises32.mkdir)(dest, { recursive: true });
19236
- const entries = await (0, import_promises32.readdir)(src, { withFileTypes: true });
19700
+ await (0, import_promises33.mkdir)(dest, { recursive: true });
19701
+ const entries = await (0, import_promises33.readdir)(src, { withFileTypes: true });
19237
19702
  for (const entry of entries) {
19238
19703
  const srcPath = import_node_path46.default.join(src, entry.name);
19239
19704
  const destPath = import_node_path46.default.join(dest, entry.name);
@@ -19246,7 +19711,7 @@ async function copyDirectoryRecursive2(src, dest, skipDirs) {
19246
19711
  }
19247
19712
  await copyDirectoryRecursive2(srcPath, destPath, skipDirs);
19248
19713
  } else {
19249
- await (0, import_promises32.cp)(srcPath, destPath, { preserveTimestamps: true, force: true });
19714
+ await (0, import_promises33.cp)(srcPath, destPath, { preserveTimestamps: true, force: true });
19250
19715
  }
19251
19716
  }
19252
19717
  }
@@ -19270,7 +19735,7 @@ var WorkspacePoolManager = class {
19270
19735
  const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
19271
19736
  const fingerprint = computeWorkspaceFingerprint(repos);
19272
19737
  const poolDir = import_node_path46.default.join(this.poolRoot, fingerprint);
19273
- await (0, import_promises32.mkdir)(poolDir, { recursive: true });
19738
+ await (0, import_promises33.mkdir)(poolDir, { recursive: true });
19274
19739
  const drifted = await this.checkDrift(poolDir, fingerprint);
19275
19740
  if (drifted) {
19276
19741
  console.warn(
@@ -19297,7 +19762,7 @@ var WorkspacePoolManager = class {
19297
19762
  poolDir
19298
19763
  };
19299
19764
  }
19300
- await (0, import_promises32.mkdir)(slotPath, { recursive: true });
19765
+ await (0, import_promises33.mkdir)(slotPath, { recursive: true });
19301
19766
  if (templatePath) {
19302
19767
  await copyDirectoryRecursive2(templatePath, slotPath);
19303
19768
  }
@@ -19321,7 +19786,7 @@ var WorkspacePoolManager = class {
19321
19786
  /** Remove lock file to release a slot. */
19322
19787
  async releaseSlot(slot) {
19323
19788
  try {
19324
- await (0, import_promises32.unlink)(slot.lockPath);
19789
+ await (0, import_promises33.unlink)(slot.lockPath);
19325
19790
  } catch {
19326
19791
  }
19327
19792
  }
@@ -19334,21 +19799,21 @@ var WorkspacePoolManager = class {
19334
19799
  async tryLock(lockPath) {
19335
19800
  for (let attempt = 0; attempt < 3; attempt++) {
19336
19801
  try {
19337
- await (0, import_promises32.writeFile)(lockPath, String(process.pid), { flag: "wx" });
19802
+ await (0, import_promises33.writeFile)(lockPath, String(process.pid), { flag: "wx" });
19338
19803
  return true;
19339
19804
  } catch (err) {
19340
19805
  if (err.code !== "EEXIST") {
19341
19806
  throw err;
19342
19807
  }
19343
19808
  try {
19344
- const pidStr = await (0, import_promises32.readFile)(lockPath, "utf-8");
19809
+ const pidStr = await (0, import_promises33.readFile)(lockPath, "utf-8");
19345
19810
  const pid = Number.parseInt(pidStr.trim(), 10);
19346
19811
  if (!Number.isNaN(pid)) {
19347
19812
  try {
19348
19813
  process.kill(pid, 0);
19349
19814
  return false;
19350
19815
  } catch {
19351
- await (0, import_promises32.unlink)(lockPath).catch(() => {
19816
+ await (0, import_promises33.unlink)(lockPath).catch(() => {
19352
19817
  });
19353
19818
  continue;
19354
19819
  }
@@ -19368,7 +19833,7 @@ var WorkspacePoolManager = class {
19368
19833
  async checkDrift(poolDir, fingerprint) {
19369
19834
  const metadataPath = import_node_path46.default.join(poolDir, "metadata.json");
19370
19835
  try {
19371
- const raw = await (0, import_promises32.readFile)(metadataPath, "utf-8");
19836
+ const raw = await (0, import_promises33.readFile)(metadataPath, "utf-8");
19372
19837
  const metadata = JSON.parse(raw);
19373
19838
  return metadata.fingerprint !== fingerprint;
19374
19839
  } catch {
@@ -19383,17 +19848,17 @@ var WorkspacePoolManager = class {
19383
19848
  repos,
19384
19849
  createdAt: (/* @__PURE__ */ new Date()).toISOString()
19385
19850
  };
19386
- await (0, import_promises32.writeFile)(import_node_path46.default.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
19851
+ await (0, import_promises33.writeFile)(import_node_path46.default.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
19387
19852
  }
19388
19853
  /** Remove all slot directories and their lock files from a pool directory. */
19389
19854
  async removeAllSlots(poolDir) {
19390
- const entries = await (0, import_promises32.readdir)(poolDir);
19855
+ const entries = await (0, import_promises33.readdir)(poolDir);
19391
19856
  for (const entry of entries) {
19392
19857
  if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
19393
19858
  const lockPath = import_node_path46.default.join(poolDir, `${entry}.lock`);
19394
19859
  if ((0, import_node_fs14.existsSync)(lockPath)) {
19395
19860
  try {
19396
- const pidStr = await (0, import_promises32.readFile)(lockPath, "utf-8");
19861
+ const pidStr = await (0, import_promises33.readFile)(lockPath, "utf-8");
19397
19862
  const pid = Number.parseInt(pidStr.trim(), 10);
19398
19863
  if (!Number.isNaN(pid)) {
19399
19864
  try {
@@ -19406,12 +19871,12 @@ var WorkspacePoolManager = class {
19406
19871
  } catch {
19407
19872
  }
19408
19873
  }
19409
- await (0, import_promises32.rm)(import_node_path46.default.join(poolDir, entry), { recursive: true, force: true });
19410
- await (0, import_promises32.rm)(lockPath, { force: true }).catch(() => {
19874
+ await (0, import_promises33.rm)(import_node_path46.default.join(poolDir, entry), { recursive: true, force: true });
19875
+ await (0, import_promises33.rm)(lockPath, { force: true }).catch(() => {
19411
19876
  });
19412
19877
  }
19413
19878
  }
19414
- await (0, import_promises32.rm)(import_node_path46.default.join(poolDir, "metadata.json"), { force: true }).catch(() => {
19879
+ await (0, import_promises33.rm)(import_node_path46.default.join(poolDir, "metadata.json"), { force: true }).catch(() => {
19415
19880
  });
19416
19881
  }
19417
19882
  /**
@@ -19421,6 +19886,7 @@ var WorkspacePoolManager = class {
19421
19886
  */
19422
19887
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
19423
19888
  for (const repo of repos) {
19889
+ if (!repo.path || !repo.source) continue;
19424
19890
  const repoDir = import_node_path46.default.join(slotPath, repo.path);
19425
19891
  if (!(0, import_node_fs14.existsSync)(repoDir)) {
19426
19892
  continue;
@@ -19428,7 +19894,7 @@ var WorkspacePoolManager = class {
19428
19894
  if (poolReset === "none") {
19429
19895
  continue;
19430
19896
  }
19431
- const ref = repo.checkout?.ref ?? "HEAD";
19897
+ const ref = getRepoCheckoutRef(repo.checkout);
19432
19898
  const resolve = repo.checkout?.resolve ?? "remote";
19433
19899
  if (resolve === "remote") {
19434
19900
  const fetchArgs = ["fetch", "origin", ref];
@@ -19445,8 +19911,8 @@ var WorkspacePoolManager = class {
19445
19911
  }
19446
19912
  if (templatePath) {
19447
19913
  const repoDirNames = new Set(
19448
- repos.map((r) => {
19449
- const normalized = r.path.replace(/^\.\//, "");
19914
+ repos.filter((r) => r.path).map((r) => {
19915
+ const normalized = (r.path ?? "").replace(/^\.\//, "");
19450
19916
  return normalized.split("/")[0];
19451
19917
  })
19452
19918
  );
@@ -19502,17 +19968,17 @@ var RepoManager = class {
19502
19968
  static validateLocalPaths(repos) {
19503
19969
  const errors = [];
19504
19970
  for (const repo of repos) {
19505
- if (repo.source.type !== "local") continue;
19971
+ if (!repo.source || repo.source.type !== "local") continue;
19506
19972
  const sourcePath = repo.source.path;
19507
19973
  if (!sourcePath || sourcePath.trim() === "") {
19508
19974
  errors.push({
19509
- repoPath: repo.path,
19975
+ repoPath: repo.path ?? "(none)",
19510
19976
  resolvedSourcePath: sourcePath ?? "",
19511
19977
  reason: "empty_path"
19512
19978
  });
19513
19979
  } else if (!(0, import_node_fs15.existsSync)(sourcePath)) {
19514
19980
  errors.push({
19515
- repoPath: repo.path,
19981
+ repoPath: repo.path ?? "(none)",
19516
19982
  resolvedSourcePath: sourcePath,
19517
19983
  reason: "not_found"
19518
19984
  });
@@ -19559,6 +20025,12 @@ ${lines.join("\n")}`;
19559
20025
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
19560
20026
  */
19561
20027
  async materialize(repo, workspacePath) {
20028
+ if (!repo.source || !repo.path) {
20029
+ if (this.verbose) {
20030
+ console.log(`[repo] materialize skip path=${repo.path ?? "(none)"} (no source or path)`);
20031
+ }
20032
+ return;
20033
+ }
19562
20034
  const targetDir = import_node_path47.default.join(workspacePath, repo.path);
19563
20035
  const sourceUrl = getSourceUrl(repo.source);
19564
20036
  const startedAt = Date.now();
@@ -19582,7 +20054,7 @@ ${lines.join("\n")}`;
19582
20054
  await this.runGit(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
19583
20055
  await this.runGit(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
19584
20056
  }
19585
- const ref = repo.checkout?.ref ?? "HEAD";
20057
+ const ref = getRepoCheckoutRef(repo.checkout);
19586
20058
  const resolve = repo.checkout?.resolve ?? "remote";
19587
20059
  let resolvedSha;
19588
20060
  if (resolve === "remote" && repo.source.type === "git") {
@@ -19634,22 +20106,26 @@ ${lines.join("\n")}`;
19634
20106
  );
19635
20107
  }
19636
20108
  }
19637
- /** Materialize all repos into the workspace. */
20109
+ /** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
19638
20110
  async materializeAll(repos, workspacePath) {
20111
+ const materializableRepos = repos.filter((r) => r.source);
19639
20112
  if (this.verbose) {
19640
- console.log(`[repo] materializeAll count=${repos.length} workspace=${workspacePath}`);
20113
+ console.log(
20114
+ `[repo] materializeAll count=${materializableRepos.length} (${repos.length - materializableRepos.length} skipped, no source) workspace=${workspacePath}`
20115
+ );
19641
20116
  }
19642
- for (const repo of repos) {
20117
+ for (const repo of materializableRepos) {
19643
20118
  await this.materialize(repo, workspacePath);
19644
20119
  }
19645
20120
  if (this.verbose) {
19646
20121
  console.log("[repo] materializeAll complete");
19647
20122
  }
19648
20123
  }
19649
- /** Reset repos in workspace to their checkout state. */
20124
+ /** Reset repos in workspace to their checkout state. Skips repos without path or source. */
19650
20125
  async reset(repos, workspacePath, reset) {
19651
20126
  const cleanFlag = reset === "strict" ? "-fdx" : "-fd";
19652
20127
  for (const repo of repos) {
20128
+ if (!repo.path || !repo.source) continue;
19653
20129
  const targetDir = import_node_path47.default.join(workspacePath, repo.path);
19654
20130
  await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
19655
20131
  await this.runGit(["clean", cleanFlag], { cwd: targetDir });
@@ -19659,14 +20135,14 @@ ${lines.join("\n")}`;
19659
20135
 
19660
20136
  // src/evaluation/workspace/resolve.ts
19661
20137
  init_cjs_shims();
19662
- var import_promises33 = require("fs/promises");
20138
+ var import_promises34 = require("fs/promises");
19663
20139
  var import_node_path48 = __toESM(require("path"), 1);
19664
20140
  async function resolveWorkspaceTemplate(templatePath) {
19665
20141
  if (!templatePath) {
19666
20142
  return void 0;
19667
20143
  }
19668
20144
  const resolved = import_node_path48.default.resolve(templatePath);
19669
- const stats = await (0, import_promises33.stat)(resolved);
20145
+ const stats = await (0, import_promises34.stat)(resolved);
19670
20146
  if (stats.isFile()) {
19671
20147
  return {
19672
20148
  dir: import_node_path48.default.dirname(resolved),
@@ -19676,7 +20152,7 @@ async function resolveWorkspaceTemplate(templatePath) {
19676
20152
  if (!stats.isDirectory()) {
19677
20153
  throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
19678
20154
  }
19679
- const entries = await (0, import_promises33.readdir)(resolved);
20155
+ const entries = await (0, import_promises34.readdir)(resolved);
19680
20156
  const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
19681
20157
  if (workspaceFiles.length === 1) {
19682
20158
  return {
@@ -19696,6 +20172,7 @@ async function resolveWorkspaceTemplate(templatePath) {
19696
20172
 
19697
20173
  // src/evaluation/workspace/script-executor.ts
19698
20174
  init_cjs_shims();
20175
+ init_exec();
19699
20176
  function interpolateArgs(args, context2) {
19700
20177
  const vars = {
19701
20178
  workspace_path: context2.workspacePath,
@@ -19975,7 +20452,8 @@ async function runEvaluation(options) {
19975
20452
  for (const ec of filteredEvalCases) {
19976
20453
  if (ec.workspace?.repos) {
19977
20454
  for (const repo of ec.workspace.repos) {
19978
- const key = `${repo.path}::${repo.source.type === "local" ? repo.source.path : ""}`;
20455
+ if (!repo.source) continue;
20456
+ const key = `${repo.path ?? ""}::${repo.source.type === "local" ? repo.source.path : ""}`;
19979
20457
  if (!allRepos.has(key)) {
19980
20458
  allRepos.set(key, repo);
19981
20459
  }
@@ -19988,7 +20466,7 @@ async function runEvaluation(options) {
19988
20466
  const message = RepoManager.formatValidationErrors(localPathErrors);
19989
20467
  console.warn(`Warning: ${message}`);
19990
20468
  const invalidLocalRepoPaths = new Set(localPathErrors.map((e) => e.repoPath));
19991
- if (suiteWorkspace?.repos?.some((r) => invalidLocalRepoPaths.has(r.path))) {
20469
+ if (suiteWorkspace?.repos?.some((r) => r.path && invalidLocalRepoPaths.has(r.path))) {
19992
20470
  throw new Error(message);
19993
20471
  }
19994
20472
  }
@@ -20049,14 +20527,14 @@ async function runEvaluation(options) {
20049
20527
  let staticMaterialised = false;
20050
20528
  const isYamlConfiguredPath = !cliWorkspacePath && !!yamlWorkspacePath;
20051
20529
  if (useStaticWorkspace && configuredStaticPath) {
20052
- const dirExists = await (0, import_promises34.stat)(configuredStaticPath).then(
20530
+ const dirExists = await (0, import_promises35.stat)(configuredStaticPath).then(
20053
20531
  (s) => s.isDirectory(),
20054
20532
  () => false
20055
20533
  );
20056
- const isEmpty = dirExists ? (await (0, import_promises34.readdir)(configuredStaticPath)).length === 0 : false;
20534
+ const isEmpty = dirExists ? (await (0, import_promises35.readdir)(configuredStaticPath)).length === 0 : false;
20057
20535
  if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
20058
20536
  if (!dirExists) {
20059
- await (0, import_promises34.mkdir)(configuredStaticPath, { recursive: true });
20537
+ await (0, import_promises35.mkdir)(configuredStaticPath, { recursive: true });
20060
20538
  }
20061
20539
  if (workspaceTemplate) {
20062
20540
  await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
@@ -20101,14 +20579,14 @@ async function runEvaluation(options) {
20101
20579
  }
20102
20580
  } else if (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
20103
20581
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
20104
- await (0, import_promises34.mkdir)(sharedWorkspacePath, { recursive: true });
20582
+ await (0, import_promises35.mkdir)(sharedWorkspacePath, { recursive: true });
20105
20583
  setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
20106
20584
  }
20107
20585
  try {
20108
20586
  if (suiteWorkspaceFile && sharedWorkspacePath) {
20109
20587
  const copiedWorkspaceFile = import_node_path49.default.join(sharedWorkspacePath, import_node_path49.default.basename(suiteWorkspaceFile));
20110
20588
  try {
20111
- await (0, import_promises34.stat)(copiedWorkspaceFile);
20589
+ await (0, import_promises35.stat)(copiedWorkspaceFile);
20112
20590
  suiteWorkspaceFile = copiedWorkspaceFile;
20113
20591
  } catch {
20114
20592
  }
@@ -20121,6 +20599,7 @@ async function runEvaluation(options) {
20121
20599
  try {
20122
20600
  if (needsPerRepoCheck) {
20123
20601
  for (const repo of suiteWorkspace.repos) {
20602
+ if (!repo.path || !repo.source) continue;
20124
20603
  const targetDir = import_node_path49.default.join(sharedWorkspacePath, repo.path);
20125
20604
  if ((0, import_node_fs16.existsSync)(targetDir)) {
20126
20605
  setupLog(`reusing existing repo at: ${targetDir}`);
@@ -20145,6 +20624,19 @@ async function runEvaluation(options) {
20145
20624
  throw new Error(`Failed to materialize repos: ${message}`);
20146
20625
  }
20147
20626
  }
20627
+ const suiteDockerConfig = suiteWorkspace?.docker;
20628
+ if (suiteDockerConfig) {
20629
+ setupLog(`pulling Docker image: ${suiteDockerConfig.image}`);
20630
+ const { DockerWorkspaceProvider: DockerWorkspaceProvider2 } = await Promise.resolve().then(() => (init_docker_workspace(), docker_workspace_exports));
20631
+ const dockerSetup = new DockerWorkspaceProvider2(suiteDockerConfig);
20632
+ if (!await dockerSetup.isDockerAvailable()) {
20633
+ throw new Error(
20634
+ "Docker workspace configured but Docker CLI is not available. Install Docker and ensure it is running."
20635
+ );
20636
+ }
20637
+ await dockerSetup.pullImage();
20638
+ setupLog("Docker image pull complete");
20639
+ }
20148
20640
  const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
20149
20641
  const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
20150
20642
  if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
@@ -20505,11 +20997,9 @@ async function runBatchEvaluation(options) {
20505
20997
  const promptInputs = promptInputsList[index];
20506
20998
  return {
20507
20999
  question: promptInputs.question,
21000
+ systemPrompt: promptInputs.systemMessage,
20508
21001
  inputFiles: evalCase.file_paths,
20509
- evalCaseId: evalCase.id,
20510
- metadata: {
20511
- systemPrompt: promptInputs.systemMessage ?? ""
20512
- }
21002
+ evalCaseId: evalCase.id
20513
21003
  };
20514
21004
  });
20515
21005
  const batchResponse = await provider.invokeBatch?.(batchRequests);
@@ -20710,7 +21200,7 @@ async function runEvalCase(options) {
20710
21200
  if (caseWorkspaceFile && workspacePath) {
20711
21201
  const copiedFile = import_node_path49.default.join(workspacePath, import_node_path49.default.basename(caseWorkspaceFile));
20712
21202
  try {
20713
- await (0, import_promises34.stat)(copiedFile);
21203
+ await (0, import_promises35.stat)(copiedFile);
20714
21204
  caseWorkspaceFile = copiedFile;
20715
21205
  } catch {
20716
21206
  }
@@ -20718,7 +21208,7 @@ async function runEvalCase(options) {
20718
21208
  }
20719
21209
  if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
20720
21210
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
20721
- await (0, import_promises34.mkdir)(workspacePath, { recursive: true });
21211
+ await (0, import_promises35.mkdir)(workspacePath, { recursive: true });
20722
21212
  }
20723
21213
  if (evalCase.workspace?.repos?.length && workspacePath) {
20724
21214
  const localPathErrors = RepoManager.validateLocalPaths(evalCase.workspace.repos);
@@ -20773,8 +21263,8 @@ async function runEvalCase(options) {
20773
21263
  const srcPath = import_node_path49.default.resolve(baseDir, relPath);
20774
21264
  const destPath = import_node_path49.default.resolve(workspacePath, relPath);
20775
21265
  try {
20776
- await (0, import_promises34.mkdir)(import_node_path49.default.dirname(destPath), { recursive: true });
20777
- await (0, import_promises34.copyFile)(srcPath, destPath);
21266
+ await (0, import_promises35.mkdir)(import_node_path49.default.dirname(destPath), { recursive: true });
21267
+ await (0, import_promises35.copyFile)(srcPath, destPath);
20778
21268
  } catch (error) {
20779
21269
  const message = error instanceof Error ? error.message : String(error);
20780
21270
  return buildErrorResult(
@@ -21040,6 +21530,7 @@ async function runEvalCase(options) {
21040
21530
  availableTargets,
21041
21531
  fileChanges,
21042
21532
  workspacePath,
21533
+ dockerConfig: evalCase.workspace?.docker,
21043
21534
  verbose,
21044
21535
  threshold: evalCase.threshold ?? caseThreshold
21045
21536
  });
@@ -21233,6 +21724,7 @@ async function evaluateCandidate(options) {
21233
21724
  availableTargets,
21234
21725
  fileChanges,
21235
21726
  workspacePath,
21727
+ dockerConfig,
21236
21728
  threshold: evalThreshold
21237
21729
  } = options;
21238
21730
  const gradeTimestamp = nowFn();
@@ -21259,6 +21751,7 @@ async function evaluateCandidate(options) {
21259
21751
  availableTargets,
21260
21752
  fileChanges,
21261
21753
  workspacePath,
21754
+ dockerConfig,
21262
21755
  threshold: evalThreshold
21263
21756
  });
21264
21757
  const completedAt = nowFn();
@@ -21334,6 +21827,7 @@ async function runEvaluatorsForCase(options) {
21334
21827
  availableTargets,
21335
21828
  fileChanges,
21336
21829
  workspacePath,
21830
+ dockerConfig,
21337
21831
  threshold
21338
21832
  } = options;
21339
21833
  if (evalCase.assertions && evalCase.assertions.length > 0) {
@@ -21361,6 +21855,7 @@ async function runEvaluatorsForCase(options) {
21361
21855
  availableTargets,
21362
21856
  fileChanges,
21363
21857
  workspacePath,
21858
+ dockerConfig,
21364
21859
  threshold
21365
21860
  });
21366
21861
  }
@@ -21390,6 +21885,7 @@ async function runEvaluatorsForCase(options) {
21390
21885
  availableTargets,
21391
21886
  fileChanges,
21392
21887
  workspacePath,
21888
+ dockerConfig,
21393
21889
  ...implicitEvaluator ? { evaluator: implicitEvaluator } : {}
21394
21890
  });
21395
21891
  return { score };
@@ -21428,7 +21924,8 @@ async function runEvaluatorList(options) {
21428
21924
  targetResolver,
21429
21925
  availableTargets,
21430
21926
  fileChanges,
21431
- workspacePath
21927
+ workspacePath,
21928
+ dockerConfig
21432
21929
  } = options;
21433
21930
  const scored = [];
21434
21931
  const scores = [];
@@ -21451,7 +21948,8 @@ async function runEvaluatorList(options) {
21451
21948
  targetResolver,
21452
21949
  availableTargets,
21453
21950
  fileChanges,
21454
- workspacePath
21951
+ workspacePath,
21952
+ dockerConfig
21455
21953
  };
21456
21954
  const evalFileDir = evalCase.file_paths[0] ? import_node_path49.default.dirname(evalCase.file_paths[0]) : process.cwd();
21457
21955
  const dispatchContext = {
@@ -21613,13 +22111,11 @@ async function invokeProvider(provider, options) {
21613
22111
  const braintrustSpanIds = streamCallbacks?.getActiveSpanIds?.() ?? void 0;
21614
22112
  return await provider.invoke({
21615
22113
  question: promptInputs.question,
22114
+ systemPrompt: promptInputs.systemMessage,
21616
22115
  chatPrompt: promptInputs.chatPrompt,
21617
22116
  inputFiles: evalCase.file_paths,
21618
22117
  evalCaseId: evalCase.id,
21619
22118
  attempt,
21620
- metadata: {
21621
- systemPrompt: promptInputs.systemMessage ?? ""
21622
- },
21623
22119
  signal: controller.signal,
21624
22120
  cwd,
21625
22121
  workspaceFile,
@@ -21991,7 +22487,7 @@ async function discoverDefaultTarget(repoRoot) {
21991
22487
  return null;
21992
22488
  }
21993
22489
  async function loadEnvHierarchy(repoRoot, startPath) {
21994
- const { readFileSync: readFileSync4 } = await import("fs");
22490
+ const { readFileSync: readFileSync5 } = await import("fs");
21995
22491
  const chain = buildDirectoryChain2(startPath, repoRoot);
21996
22492
  const envFiles = [];
21997
22493
  for (const dir of chain) {
@@ -22000,7 +22496,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
22000
22496
  }
22001
22497
  for (let i = 0; i < envFiles.length; i++) {
22002
22498
  try {
22003
- const content = readFileSync4(envFiles[i], "utf8");
22499
+ const content = readFileSync5(envFiles[i], "utf8");
22004
22500
  for (const line of content.split("\n")) {
22005
22501
  const trimmed = line.trim();
22006
22502
  if (!trimmed || trimmed.startsWith("#")) continue;
@@ -22073,12 +22569,12 @@ var CONFIG_FILE_NAMES = [
22073
22569
  ".agentv/config.js"
22074
22570
  ];
22075
22571
  async function loadTsConfig(projectRoot) {
22076
- const { existsSync: existsSync8 } = await import("fs");
22572
+ const { existsSync: existsSync9 } = await import("fs");
22077
22573
  const { pathToFileURL: pathToFileURL2 } = await import("url");
22078
22574
  const { join: join2 } = await import("path");
22079
22575
  for (const fileName of CONFIG_FILE_NAMES) {
22080
22576
  const filePath = join2(projectRoot, fileName);
22081
- if (!existsSync8(filePath)) {
22577
+ if (!existsSync9(filePath)) {
22082
22578
  continue;
22083
22579
  }
22084
22580
  try {
@@ -22183,9 +22679,9 @@ init_cjs_shims();
22183
22679
 
22184
22680
  // src/evaluation/workspace/deps-scanner.ts
22185
22681
  init_cjs_shims();
22186
- var import_promises35 = require("fs/promises");
22682
+ var import_promises36 = require("fs/promises");
22187
22683
  var import_node_path51 = __toESM(require("path"), 1);
22188
- var import_yaml8 = require("yaml");
22684
+ var import_yaml9 = require("yaml");
22189
22685
  function normalizeGitUrl(url) {
22190
22686
  let normalized = url.replace(/\.git$/, "");
22191
22687
  try {
@@ -22203,7 +22699,7 @@ async function scanRepoDeps(evalFilePaths) {
22203
22699
  try {
22204
22700
  const repos = await extractReposFromEvalFile(filePath);
22205
22701
  for (const repo of repos) {
22206
- if (repo.source.type !== "git") continue;
22702
+ if (!repo.source || repo.source.type !== "git") continue;
22207
22703
  const ref = repo.checkout?.ref;
22208
22704
  const key = `${normalizeGitUrl(repo.source.url)}\0${ref ?? ""}`;
22209
22705
  const existing = seen.get(key);
@@ -22231,8 +22727,8 @@ async function scanRepoDeps(evalFilePaths) {
22231
22727
  return { repos: [...seen.values()], errors };
22232
22728
  }
22233
22729
  async function extractReposFromEvalFile(filePath) {
22234
- const content = await (0, import_promises35.readFile)(filePath, "utf8");
22235
- const parsed = interpolateEnv((0, import_yaml8.parse)(content), process.env);
22730
+ const content = await (0, import_promises36.readFile)(filePath, "utf8");
22731
+ const parsed = interpolateEnv((0, import_yaml9.parse)(content), process.env);
22236
22732
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return [];
22237
22733
  const obj = parsed;
22238
22734
  const evalFileDir = import_node_path51.default.dirname(import_node_path51.default.resolve(filePath));
@@ -22252,8 +22748,8 @@ async function extractReposFromEvalFile(filePath) {
22252
22748
  async function extractReposFromWorkspaceRaw(raw, evalFileDir) {
22253
22749
  if (typeof raw === "string") {
22254
22750
  const workspaceFilePath = import_node_path51.default.resolve(evalFileDir, raw);
22255
- const content = await (0, import_promises35.readFile)(workspaceFilePath, "utf8");
22256
- const parsed = interpolateEnv((0, import_yaml8.parse)(content), process.env);
22751
+ const content = await (0, import_promises36.readFile)(workspaceFilePath, "utf8");
22752
+ const parsed = interpolateEnv((0, import_yaml9.parse)(content), process.env);
22257
22753
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return [];
22258
22754
  return extractReposFromObject(parsed);
22259
22755
  }
@@ -22279,9 +22775,12 @@ function extractReposFromObject(obj) {
22279
22775
  return result;
22280
22776
  }
22281
22777
 
22778
+ // src/evaluation/workspace/index.ts
22779
+ init_docker_workspace();
22780
+
22282
22781
  // src/evaluation/cache/response-cache.ts
22283
22782
  init_cjs_shims();
22284
- var import_promises36 = require("fs/promises");
22783
+ var import_promises37 = require("fs/promises");
22285
22784
  var import_node_path52 = __toESM(require("path"), 1);
22286
22785
  var DEFAULT_CACHE_PATH = ".agentv/cache";
22287
22786
  var ResponseCache = class {
@@ -22292,7 +22791,7 @@ var ResponseCache = class {
22292
22791
  async get(key) {
22293
22792
  const filePath = this.keyToPath(key);
22294
22793
  try {
22295
- const data = await (0, import_promises36.readFile)(filePath, "utf8");
22794
+ const data = await (0, import_promises37.readFile)(filePath, "utf8");
22296
22795
  return JSON.parse(data);
22297
22796
  } catch {
22298
22797
  return void 0;
@@ -22301,8 +22800,8 @@ var ResponseCache = class {
22301
22800
  async set(key, value) {
22302
22801
  const filePath = this.keyToPath(key);
22303
22802
  const dir = import_node_path52.default.dirname(filePath);
22304
- await (0, import_promises36.mkdir)(dir, { recursive: true });
22305
- await (0, import_promises36.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
22803
+ await (0, import_promises37.mkdir)(dir, { recursive: true });
22804
+ await (0, import_promises37.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
22306
22805
  }
22307
22806
  keyToPath(key) {
22308
22807
  const prefix = key.slice(0, 2);
@@ -22321,22 +22820,304 @@ function shouldSkipCacheForTemperature(targetConfig) {
22321
22820
  return false;
22322
22821
  }
22323
22822
 
22324
- // src/projects.ts
22823
+ // src/evaluation/results-repo.ts
22325
22824
  init_cjs_shims();
22825
+ var import_node_child_process11 = require("child_process");
22326
22826
  var import_node_fs18 = require("fs");
22827
+ var import_promises38 = require("fs/promises");
22828
+ var import_node_os8 = __toESM(require("os"), 1);
22327
22829
  var import_node_path53 = __toESM(require("path"), 1);
22328
- var import_yaml9 = require("yaml");
22830
+ var import_node_util7 = require("util");
22831
+ var execFileAsync3 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
22832
+ function sanitizeRepoSlug(repo) {
22833
+ return repo.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
22834
+ }
22835
+ function withFriendlyGitHubAuthError(error) {
22836
+ const message = error instanceof Error ? error.message : String(error);
22837
+ const lower = message.toLowerCase();
22838
+ if (lower.includes("authentication failed") || lower.includes("could not read username") || lower.includes("permission denied") || lower.includes("not logged into any github hosts")) {
22839
+ return new Error(`${message}. Run 'gh auth login' to authenticate.`);
22840
+ }
22841
+ return new Error(message);
22842
+ }
22843
+ function normalizeResultsExportConfig(config) {
22844
+ return {
22845
+ repo: config.repo.trim(),
22846
+ path: config.path.trim().replace(/^\/+|\/+$/g, ""),
22847
+ auto_push: config.auto_push === true,
22848
+ branch_prefix: config.branch_prefix?.trim() || "eval-results"
22849
+ };
22850
+ }
22851
+ function resolveResultsRepoUrl(repo) {
22852
+ if (repo.includes("://") || repo.startsWith("git@")) {
22853
+ return repo;
22854
+ }
22855
+ return `https://github.com/${repo}.git`;
22856
+ }
22857
+ function getResultsRepoCachePaths(repo) {
22858
+ const rootDir = import_node_path53.default.join(getAgentvHome(), "cache", "results-repo", sanitizeRepoSlug(repo));
22859
+ return {
22860
+ rootDir,
22861
+ repoDir: import_node_path53.default.join(rootDir, "repo"),
22862
+ statusFile: import_node_path53.default.join(rootDir, "status.json")
22863
+ };
22864
+ }
22865
+ function readPersistedStatus(statusFile) {
22866
+ if (!(0, import_node_fs18.existsSync)(statusFile)) {
22867
+ return {};
22868
+ }
22869
+ try {
22870
+ return JSON.parse((0, import_node_fs18.readFileSync)(statusFile, "utf8"));
22871
+ } catch {
22872
+ return {};
22873
+ }
22874
+ }
22875
+ function writePersistedStatus(statusFile, status) {
22876
+ (0, import_node_fs18.mkdirSync)(import_node_path53.default.dirname(statusFile), { recursive: true });
22877
+ (0, import_node_fs18.writeFileSync)(statusFile, `${JSON.stringify(status, null, 2)}
22878
+ `, "utf8");
22879
+ }
22880
+ async function runCommand(executable, args, options) {
22881
+ try {
22882
+ const { stdout, stderr } = await execFileAsync3(executable, [...args], {
22883
+ cwd: options?.cwd,
22884
+ env: process.env
22885
+ });
22886
+ return { stdout, stderr };
22887
+ } catch (error) {
22888
+ if (options?.check === false && error && typeof error === "object") {
22889
+ const execError = error;
22890
+ return {
22891
+ stdout: execError.stdout ?? "",
22892
+ stderr: execError.stderr ?? ""
22893
+ };
22894
+ }
22895
+ throw withFriendlyGitHubAuthError(error);
22896
+ }
22897
+ }
22898
+ async function runGit(args, options) {
22899
+ return runCommand("git", args, options);
22900
+ }
22901
+ async function runGh(args, options) {
22902
+ return runCommand("gh", args, options);
22903
+ }
22904
+ async function resolveDefaultBranch(repoDir) {
22905
+ try {
22906
+ const { stdout } = await runGit(["symbolic-ref", "refs/remotes/origin/HEAD"], { cwd: repoDir });
22907
+ const ref = stdout.trim();
22908
+ const prefix = "refs/remotes/origin/";
22909
+ if (ref.startsWith(prefix)) {
22910
+ return ref.slice(prefix.length);
22911
+ }
22912
+ } catch {
22913
+ }
22914
+ for (const candidate of ["main", "master"]) {
22915
+ try {
22916
+ await runGit(["rev-parse", "--verify", `origin/${candidate}`], { cwd: repoDir });
22917
+ return candidate;
22918
+ } catch {
22919
+ }
22920
+ }
22921
+ return "main";
22922
+ }
22923
+ async function updateCacheRepo(repoDir, baseBranch) {
22924
+ await runGit(["fetch", "origin", "--prune"], { cwd: repoDir });
22925
+ await runGit(["checkout", baseBranch], { cwd: repoDir });
22926
+ await runGit(["pull", "--ff-only", "origin", baseBranch], { cwd: repoDir });
22927
+ }
22928
+ function updateStatusFile(config, patch) {
22929
+ const cachePaths = getResultsRepoCachePaths(config.repo);
22930
+ const current = readPersistedStatus(cachePaths.statusFile);
22931
+ writePersistedStatus(cachePaths.statusFile, {
22932
+ ...current,
22933
+ ...patch
22934
+ });
22935
+ }
22936
+ async function ensureResultsRepoClone(config) {
22937
+ const normalized = normalizeResultsExportConfig(config);
22938
+ const cachePaths = getResultsRepoCachePaths(normalized.repo);
22939
+ (0, import_node_fs18.mkdirSync)(cachePaths.rootDir, { recursive: true });
22940
+ if (!(0, import_node_fs18.existsSync)(cachePaths.repoDir)) {
22941
+ try {
22942
+ await runGit([
22943
+ "clone",
22944
+ "--filter=blob:none",
22945
+ resolveResultsRepoUrl(normalized.repo),
22946
+ cachePaths.repoDir
22947
+ ]);
22948
+ return cachePaths.repoDir;
22949
+ } catch (error) {
22950
+ updateStatusFile(normalized, { last_error: withFriendlyGitHubAuthError(error).message });
22951
+ throw withFriendlyGitHubAuthError(error);
22952
+ }
22953
+ }
22954
+ if (!(0, import_node_fs18.existsSync)(import_node_path53.default.join(cachePaths.repoDir, ".git"))) {
22955
+ throw new Error(`Results repo cache is not a git repository: ${cachePaths.repoDir}`);
22956
+ }
22957
+ return cachePaths.repoDir;
22958
+ }
22959
+ function getResultsRepoStatus(config) {
22960
+ if (!config) {
22961
+ return {
22962
+ configured: false,
22963
+ available: false,
22964
+ repo: "",
22965
+ cache_dir: ""
22966
+ };
22967
+ }
22968
+ const normalized = normalizeResultsExportConfig(config);
22969
+ const cachePaths = getResultsRepoCachePaths(normalized.repo);
22970
+ const persisted = readPersistedStatus(cachePaths.statusFile);
22971
+ return {
22972
+ configured: true,
22973
+ available: (0, import_node_fs18.existsSync)(cachePaths.repoDir),
22974
+ repo: normalized.repo,
22975
+ path: normalized.path,
22976
+ auto_push: normalized.auto_push,
22977
+ branch_prefix: normalized.branch_prefix,
22978
+ cache_dir: cachePaths.repoDir,
22979
+ last_synced_at: persisted.last_synced_at,
22980
+ last_error: persisted.last_error
22981
+ };
22982
+ }
22983
+ async function syncResultsRepo(config) {
22984
+ const normalized = normalizeResultsExportConfig(config);
22985
+ try {
22986
+ const repoDir = await ensureResultsRepoClone(normalized);
22987
+ const baseBranch = await resolveDefaultBranch(repoDir);
22988
+ await updateCacheRepo(repoDir, baseBranch);
22989
+ updateStatusFile(normalized, {
22990
+ last_synced_at: (/* @__PURE__ */ new Date()).toISOString(),
22991
+ last_error: void 0
22992
+ });
22993
+ } catch (error) {
22994
+ updateStatusFile(normalized, {
22995
+ last_error: withFriendlyGitHubAuthError(error).message
22996
+ });
22997
+ throw withFriendlyGitHubAuthError(error);
22998
+ }
22999
+ return getResultsRepoStatus(normalized);
23000
+ }
23001
+ async function checkoutResultsRepoBranch(config, branchName) {
23002
+ const normalized = normalizeResultsExportConfig(config);
23003
+ const repoDir = await ensureResultsRepoClone(normalized);
23004
+ const baseBranch = await resolveDefaultBranch(repoDir);
23005
+ await updateCacheRepo(repoDir, baseBranch);
23006
+ await runGit(["checkout", "-B", branchName, `origin/${baseBranch}`], { cwd: repoDir });
23007
+ updateStatusFile(normalized, { last_error: void 0 });
23008
+ return {
23009
+ branchName,
23010
+ baseBranch,
23011
+ repoDir
23012
+ };
23013
+ }
23014
+ async function prepareResultsRepoBranch(config, branchName) {
23015
+ const normalized = normalizeResultsExportConfig(config);
23016
+ const cloneDir = await ensureResultsRepoClone(normalized);
23017
+ const baseBranch = await resolveDefaultBranch(cloneDir);
23018
+ await updateCacheRepo(cloneDir, baseBranch);
23019
+ const worktreeRoot = await (0, import_promises38.mkdtemp)(import_node_path53.default.join(import_node_os8.default.tmpdir(), "agentv-results-repo-"));
23020
+ const worktreeDir = import_node_path53.default.join(worktreeRoot, "repo");
23021
+ await runGit(["worktree", "add", "-B", branchName, worktreeDir, `origin/${baseBranch}`], {
23022
+ cwd: cloneDir
23023
+ });
23024
+ return {
23025
+ branchName,
23026
+ baseBranch,
23027
+ repoDir: worktreeDir,
23028
+ cleanup: async () => {
23029
+ try {
23030
+ await runGit(["worktree", "remove", "--force", worktreeDir], { cwd: cloneDir });
23031
+ } finally {
23032
+ await (0, import_promises38.rm)(worktreeRoot, { recursive: true, force: true }).catch(() => void 0);
23033
+ }
23034
+ }
23035
+ };
23036
+ }
23037
+ async function stageResultsArtifacts(params) {
23038
+ (0, import_node_fs18.rmSync)(params.destinationDir, { recursive: true, force: true });
23039
+ (0, import_node_fs18.mkdirSync)(import_node_path53.default.dirname(params.destinationDir), { recursive: true });
23040
+ await (0, import_promises38.cp)(params.sourceDir, params.destinationDir, { recursive: true });
23041
+ }
23042
+ function resolveResultsRepoRunsDir(config) {
23043
+ const normalized = normalizeResultsExportConfig(config);
23044
+ return import_node_path53.default.join(
23045
+ getResultsRepoCachePaths(normalized.repo).repoDir,
23046
+ ...normalized.path.split("/")
23047
+ );
23048
+ }
23049
+ async function directorySizeBytes(targetPath) {
23050
+ const entry = await (0, import_promises38.stat)(targetPath);
23051
+ if (entry.isFile()) {
23052
+ return entry.size;
23053
+ }
23054
+ let total = 0;
23055
+ for (const child of await (0, import_promises38.readdir)(targetPath, { withFileTypes: true })) {
23056
+ total += await directorySizeBytes(import_node_path53.default.join(targetPath, child.name));
23057
+ }
23058
+ return total;
23059
+ }
23060
+ async function commitAndPushResultsBranch(params) {
23061
+ await runGit(["add", "--all"], { cwd: params.repoDir });
23062
+ const { stdout: diffStdout } = await runGit(["status", "--porcelain"], {
23063
+ cwd: params.repoDir,
23064
+ check: false
23065
+ });
23066
+ if (diffStdout.trim().length === 0) {
23067
+ return false;
23068
+ }
23069
+ await runGit(["commit", "-m", params.commitMessage], { cwd: params.repoDir });
23070
+ await runGit(["push", "-u", "origin", params.branchName], { cwd: params.repoDir });
23071
+ return true;
23072
+ }
23073
+ async function pushResultsRepoBranch(config, branchName, cwd) {
23074
+ const normalized = normalizeResultsExportConfig(config);
23075
+ await runGit(["push", "-u", "origin", branchName], {
23076
+ cwd: cwd ?? getResultsRepoCachePaths(normalized.repo).repoDir
23077
+ });
23078
+ updateStatusFile(normalized, {
23079
+ last_synced_at: (/* @__PURE__ */ new Date()).toISOString(),
23080
+ last_error: void 0
23081
+ });
23082
+ }
23083
+ async function createDraftResultsPr(params) {
23084
+ const { stdout } = await runGh(
23085
+ [
23086
+ "pr",
23087
+ "create",
23088
+ "--draft",
23089
+ "--repo",
23090
+ params.repo,
23091
+ "--base",
23092
+ params.baseBranch,
23093
+ "--head",
23094
+ params.branchName,
23095
+ "--title",
23096
+ params.title,
23097
+ "--body",
23098
+ params.body
23099
+ ],
23100
+ { cwd: params.repoDir }
23101
+ );
23102
+ return stdout.trim();
23103
+ }
23104
+
23105
+ // src/projects.ts
23106
+ init_cjs_shims();
23107
+ var import_node_fs19 = require("fs");
23108
+ var import_node_path54 = __toESM(require("path"), 1);
23109
+ var import_yaml10 = require("yaml");
22329
23110
  function getProjectsRegistryPath() {
22330
- return import_node_path53.default.join(getAgentvHome(), "projects.yaml");
23111
+ return import_node_path54.default.join(getAgentvHome(), "projects.yaml");
22331
23112
  }
22332
23113
  function loadProjectRegistry() {
22333
23114
  const registryPath = getProjectsRegistryPath();
22334
- if (!(0, import_node_fs18.existsSync)(registryPath)) {
23115
+ if (!(0, import_node_fs19.existsSync)(registryPath)) {
22335
23116
  return { projects: [] };
22336
23117
  }
22337
23118
  try {
22338
- const raw = (0, import_node_fs18.readFileSync)(registryPath, "utf-8");
22339
- const parsed = (0, import_yaml9.parse)(raw);
23119
+ const raw = (0, import_node_fs19.readFileSync)(registryPath, "utf-8");
23120
+ const parsed = (0, import_yaml10.parse)(raw);
22340
23121
  if (!parsed || !Array.isArray(parsed.projects)) {
22341
23122
  return { projects: [] };
22342
23123
  }
@@ -22347,14 +23128,14 @@ function loadProjectRegistry() {
22347
23128
  }
22348
23129
  function saveProjectRegistry(registry) {
22349
23130
  const registryPath = getProjectsRegistryPath();
22350
- const dir = import_node_path53.default.dirname(registryPath);
22351
- if (!(0, import_node_fs18.existsSync)(dir)) {
22352
- (0, import_node_fs18.mkdirSync)(dir, { recursive: true });
23131
+ const dir = import_node_path54.default.dirname(registryPath);
23132
+ if (!(0, import_node_fs19.existsSync)(dir)) {
23133
+ (0, import_node_fs19.mkdirSync)(dir, { recursive: true });
22353
23134
  }
22354
- (0, import_node_fs18.writeFileSync)(registryPath, (0, import_yaml9.stringify)(registry), "utf-8");
23135
+ (0, import_node_fs19.writeFileSync)(registryPath, (0, import_yaml10.stringify)(registry), "utf-8");
22355
23136
  }
22356
23137
  function deriveProjectId(dirPath, existingIds) {
22357
- const base = import_node_path53.default.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
23138
+ const base = import_node_path54.default.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
22358
23139
  let candidate = base || "project";
22359
23140
  let suffix = 2;
22360
23141
  while (existingIds.includes(candidate)) {
@@ -22364,11 +23145,11 @@ function deriveProjectId(dirPath, existingIds) {
22364
23145
  return candidate;
22365
23146
  }
22366
23147
  function addProject(projectPath) {
22367
- const absPath = import_node_path53.default.resolve(projectPath);
22368
- if (!(0, import_node_fs18.existsSync)(absPath)) {
23148
+ const absPath = import_node_path54.default.resolve(projectPath);
23149
+ if (!(0, import_node_fs19.existsSync)(absPath)) {
22369
23150
  throw new Error(`Directory not found: ${absPath}`);
22370
23151
  }
22371
- if (!(0, import_node_fs18.existsSync)(import_node_path53.default.join(absPath, ".agentv"))) {
23152
+ if (!(0, import_node_fs19.existsSync)(import_node_path54.default.join(absPath, ".agentv"))) {
22372
23153
  throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
22373
23154
  }
22374
23155
  const registry = loadProjectRegistry();
@@ -22382,7 +23163,7 @@ function addProject(projectPath) {
22382
23163
  absPath,
22383
23164
  registry.projects.map((p) => p.id)
22384
23165
  ),
22385
- name: import_node_path53.default.basename(absPath),
23166
+ name: import_node_path54.default.basename(absPath),
22386
23167
  path: absPath,
22387
23168
  addedAt: now,
22388
23169
  lastOpenedAt: now
@@ -22411,24 +23192,24 @@ function touchProject(projectId) {
22411
23192
  }
22412
23193
  }
22413
23194
  function discoverProjects(rootDir, maxDepth = 2) {
22414
- const absRoot = import_node_path53.default.resolve(rootDir);
22415
- if (!(0, import_node_fs18.existsSync)(absRoot) || !(0, import_node_fs18.statSync)(absRoot).isDirectory()) {
23195
+ const absRoot = import_node_path54.default.resolve(rootDir);
23196
+ if (!(0, import_node_fs19.existsSync)(absRoot) || !(0, import_node_fs19.statSync)(absRoot).isDirectory()) {
22416
23197
  return [];
22417
23198
  }
22418
23199
  const results = [];
22419
23200
  function scan(dir, depth) {
22420
23201
  if (depth > maxDepth) return;
22421
- if ((0, import_node_fs18.existsSync)(import_node_path53.default.join(dir, ".agentv"))) {
23202
+ if ((0, import_node_fs19.existsSync)(import_node_path54.default.join(dir, ".agentv"))) {
22422
23203
  results.push(dir);
22423
23204
  return;
22424
23205
  }
22425
23206
  if (depth === maxDepth) return;
22426
23207
  try {
22427
- const entries = (0, import_node_fs18.readdirSync)(dir, { withFileTypes: true });
23208
+ const entries = (0, import_node_fs19.readdirSync)(dir, { withFileTypes: true });
22428
23209
  for (const entry of entries) {
22429
23210
  if (!entry.isDirectory()) continue;
22430
23211
  if (entry.name.startsWith(".") || entry.name === "node_modules") continue;
22431
- scan(import_node_path53.default.join(dir, entry.name), depth + 1);
23212
+ scan(import_node_path54.default.join(dir, entry.name), depth + 1);
22432
23213
  }
22433
23214
  } catch {
22434
23215
  }
@@ -23354,33 +24135,33 @@ function extractResponseItemContent(content) {
23354
24135
 
23355
24136
  // src/import/codex-session-discovery.ts
23356
24137
  init_cjs_shims();
23357
- var import_promises38 = require("fs/promises");
23358
- var import_node_os8 = require("os");
23359
- var import_node_path55 = __toESM(require("path"), 1);
23360
- var DEFAULT_SESSIONS_DIR = () => import_node_path55.default.join((0, import_node_os8.homedir)(), ".codex", "sessions");
24138
+ var import_promises40 = require("fs/promises");
24139
+ var import_node_os9 = require("os");
24140
+ var import_node_path56 = __toESM(require("path"), 1);
24141
+ var DEFAULT_SESSIONS_DIR = () => import_node_path56.default.join((0, import_node_os9.homedir)(), ".codex", "sessions");
23361
24142
  async function discoverCodexSessions(opts) {
23362
24143
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
23363
24144
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
23364
24145
  const sessions = [];
23365
24146
  let yearDirs;
23366
24147
  try {
23367
- yearDirs = await (0, import_promises38.readdir)(sessionsDir);
24148
+ yearDirs = await (0, import_promises40.readdir)(sessionsDir);
23368
24149
  } catch {
23369
24150
  return [];
23370
24151
  }
23371
24152
  for (const year of yearDirs) {
23372
- const yearPath = import_node_path55.default.join(sessionsDir, year);
24153
+ const yearPath = import_node_path56.default.join(sessionsDir, year);
23373
24154
  let monthDirs;
23374
24155
  try {
23375
- monthDirs = await (0, import_promises38.readdir)(yearPath);
24156
+ monthDirs = await (0, import_promises40.readdir)(yearPath);
23376
24157
  } catch {
23377
24158
  continue;
23378
24159
  }
23379
24160
  for (const month of monthDirs) {
23380
- const monthPath = import_node_path55.default.join(yearPath, month);
24161
+ const monthPath = import_node_path56.default.join(yearPath, month);
23381
24162
  let dayDirs;
23382
24163
  try {
23383
- dayDirs = await (0, import_promises38.readdir)(monthPath);
24164
+ dayDirs = await (0, import_promises40.readdir)(monthPath);
23384
24165
  } catch {
23385
24166
  continue;
23386
24167
  }
@@ -23389,22 +24170,22 @@ async function discoverCodexSessions(opts) {
23389
24170
  const dirDate = `${year}-${month}-${day}`;
23390
24171
  if (dirDate !== opts.date) continue;
23391
24172
  }
23392
- const dayPath = import_node_path55.default.join(monthPath, day);
24173
+ const dayPath = import_node_path56.default.join(monthPath, day);
23393
24174
  let files;
23394
24175
  try {
23395
- files = await (0, import_promises38.readdir)(dayPath);
24176
+ files = await (0, import_promises40.readdir)(dayPath);
23396
24177
  } catch {
23397
24178
  continue;
23398
24179
  }
23399
24180
  for (const file of files) {
23400
24181
  if (!file.startsWith("rollout-") || !file.endsWith(".jsonl")) continue;
23401
- const filePath = import_node_path55.default.join(dayPath, file);
24182
+ const filePath = import_node_path56.default.join(dayPath, file);
23402
24183
  const nameWithoutExt = file.replace(/\.jsonl$/, "");
23403
24184
  const parts = nameWithoutExt.split("-");
23404
24185
  const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
23405
24186
  let updatedAt;
23406
24187
  try {
23407
- const fileStat = await (0, import_promises38.stat)(filePath);
24188
+ const fileStat = await (0, import_promises40.stat)(filePath);
23408
24189
  updatedAt = fileStat.mtime;
23409
24190
  } catch {
23410
24191
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -23420,10 +24201,10 @@ async function discoverCodexSessions(opts) {
23420
24201
 
23421
24202
  // src/import/session-discovery.ts
23422
24203
  init_cjs_shims();
23423
- var import_promises39 = require("fs/promises");
23424
- var import_node_os9 = require("os");
23425
- var import_node_path56 = __toESM(require("path"), 1);
23426
- var DEFAULT_PROJECTS_DIR = () => import_node_path56.default.join((0, import_node_os9.homedir)(), ".claude", "projects");
24204
+ var import_promises41 = require("fs/promises");
24205
+ var import_node_os10 = require("os");
24206
+ var import_node_path57 = __toESM(require("path"), 1);
24207
+ var DEFAULT_PROJECTS_DIR = () => import_node_path57.default.join((0, import_node_os10.homedir)(), ".claude", "projects");
23427
24208
  function encodeProjectPath(projectPath) {
23428
24209
  return projectPath.replace(/\//g, "-");
23429
24210
  }
@@ -23432,7 +24213,7 @@ async function discoverClaudeSessions(opts) {
23432
24213
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
23433
24214
  let projectDirs;
23434
24215
  try {
23435
- projectDirs = await (0, import_promises39.readdir)(projectsDir);
24216
+ projectDirs = await (0, import_promises41.readdir)(projectsDir);
23436
24217
  } catch {
23437
24218
  return [];
23438
24219
  }
@@ -23442,10 +24223,10 @@ async function discoverClaudeSessions(opts) {
23442
24223
  }
23443
24224
  const sessions = [];
23444
24225
  for (const projectDir of projectDirs) {
23445
- const dirPath = import_node_path56.default.join(projectsDir, projectDir);
24226
+ const dirPath = import_node_path57.default.join(projectsDir, projectDir);
23446
24227
  let entries;
23447
24228
  try {
23448
- entries = await (0, import_promises39.readdir)(dirPath);
24229
+ entries = await (0, import_promises41.readdir)(dirPath);
23449
24230
  } catch {
23450
24231
  continue;
23451
24232
  }
@@ -23453,10 +24234,10 @@ async function discoverClaudeSessions(opts) {
23453
24234
  if (!entry.endsWith(".jsonl")) continue;
23454
24235
  const sessionId = entry.replace(/\.jsonl$/, "");
23455
24236
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
23456
- const filePath = import_node_path56.default.join(dirPath, entry);
24237
+ const filePath = import_node_path57.default.join(dirPath, entry);
23457
24238
  let updatedAt;
23458
24239
  try {
23459
- const fileStat = await (0, import_promises39.stat)(filePath);
24240
+ const fileStat = await (0, import_promises41.stat)(filePath);
23460
24241
  updatedAt = fileStat.mtime;
23461
24242
  } catch {
23462
24243
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -23478,7 +24259,7 @@ init_cjs_shims();
23478
24259
 
23479
24260
  // src/import/types.ts
23480
24261
  init_cjs_shims();
23481
- var import_promises40 = require("fs/promises");
24262
+ var import_promises42 = require("fs/promises");
23482
24263
  function toTranscriptJsonLine(entry) {
23483
24264
  const firstUserMessage = entry.messages.find((m) => m.role === "user");
23484
24265
  const input = typeof firstUserMessage?.content === "string" ? firstUserMessage.content : "";
@@ -23504,11 +24285,11 @@ function toTranscriptJsonLine(entry) {
23504
24285
  };
23505
24286
  }
23506
24287
  async function readTranscriptJsonl(filePath) {
23507
- const text = await (0, import_promises40.readFile)(filePath, "utf8");
24288
+ const text = await (0, import_promises42.readFile)(filePath, "utf8");
23508
24289
  return text.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
23509
24290
  }
23510
24291
  async function readTranscriptFile(filePath) {
23511
- return (0, import_promises40.readFile)(filePath, "utf8");
24292
+ return (0, import_promises42.readFile)(filePath, "utf8");
23512
24293
  }
23513
24294
 
23514
24295
  // src/import/transcript-provider.ts
@@ -23574,6 +24355,7 @@ function createAgentKernel() {
23574
24355
  DEFAULT_EXPLORATION_TOOLS,
23575
24356
  DEFAULT_THRESHOLD,
23576
24357
  DeterministicAssertionEvaluator,
24358
+ DockerWorkspaceProvider,
23577
24359
  EvaluatorRegistry,
23578
24360
  ExecutionMetricsEvaluator,
23579
24361
  FieldAccuracyEvaluator,
@@ -23609,9 +24391,11 @@ function createAgentKernel() {
23609
24391
  buildSearchRoots,
23610
24392
  calculateRubricScore,
23611
24393
  captureFileChanges,
24394
+ checkoutResultsRepoBranch,
23612
24395
  clampScore,
23613
24396
  cleanupEvalWorkspaces,
23614
24397
  cleanupWorkspace,
24398
+ commitAndPushResultsBranch,
23615
24399
  computeTraceSummary,
23616
24400
  computeWorkspaceFingerprint,
23617
24401
  consumeClaudeLogEntries,
@@ -23622,6 +24406,7 @@ function createAgentKernel() {
23622
24406
  createAgentKernel,
23623
24407
  createBuiltinProviderRegistry,
23624
24408
  createBuiltinRegistry,
24409
+ createDraftResultsPr,
23625
24410
  createProvider,
23626
24411
  createTempWorkspace,
23627
24412
  deepEqual,
@@ -23629,6 +24414,7 @@ function createAgentKernel() {
23629
24414
  deriveCategory,
23630
24415
  deriveProjectId,
23631
24416
  detectFormat,
24417
+ directorySizeBytes,
23632
24418
  discoverAssertions,
23633
24419
  discoverClaudeSessions,
23634
24420
  discoverCodexSessions,
@@ -23637,6 +24423,7 @@ function createAgentKernel() {
23637
24423
  discoverJudges,
23638
24424
  discoverProjects,
23639
24425
  discoverProviders,
24426
+ ensureResultsRepoClone,
23640
24427
  ensureVSCodeSubagents,
23641
24428
  evaluate,
23642
24429
  executeScript,
@@ -23661,6 +24448,8 @@ function createAgentKernel() {
23661
24448
  getOutputFilenames,
23662
24449
  getProject,
23663
24450
  getProjectsRegistryPath,
24451
+ getResultsRepoCachePaths,
24452
+ getResultsRepoStatus,
23664
24453
  getSubagentsRoot,
23665
24454
  getTextContent,
23666
24455
  getTraceStateRoot,
@@ -23690,12 +24479,15 @@ function createAgentKernel() {
23690
24479
  mergeExecutionMetrics,
23691
24480
  negateScore,
23692
24481
  normalizeLineEndings,
24482
+ normalizeResultsExportConfig,
23693
24483
  parseAgentSkillsEvals,
23694
24484
  parseClaudeSession,
23695
24485
  parseCodexSession,
23696
24486
  parseCopilotEvents,
23697
24487
  parseJsonFromText,
23698
24488
  parseJsonSafe,
24489
+ prepareResultsRepoBranch,
24490
+ pushResultsRepoBranch,
23699
24491
  readJsonFile,
23700
24492
  readTargetDefinitions,
23701
24493
  readTestSuiteMetadata,
@@ -23706,6 +24498,8 @@ function createAgentKernel() {
23706
24498
  resolveAndCreateProvider,
23707
24499
  resolveDelegatedTargetDefinition,
23708
24500
  resolveFileReference,
24501
+ resolveResultsRepoRunsDir,
24502
+ resolveResultsRepoUrl,
23709
24503
  resolveTargetDefinition,
23710
24504
  resolveWorkspaceTemplate,
23711
24505
  rubricEvaluationSchema,
@@ -23727,12 +24521,14 @@ function createAgentKernel() {
23727
24521
  scoreToVerdict,
23728
24522
  shouldEnableCache,
23729
24523
  shouldSkipCacheForTemperature,
24524
+ stageResultsArtifacts,
23730
24525
  subscribeToClaudeLogEntries,
23731
24526
  subscribeToCodexLogEntries,
23732
24527
  subscribeToCopilotCliLogEntries,
23733
24528
  subscribeToCopilotSdkLogEntries,
23734
24529
  subscribeToPiLogEntries,
23735
24530
  substituteVariables,
24531
+ syncResultsRepo,
23736
24532
  toCamelCaseDeep,
23737
24533
  toSnakeCaseDeep,
23738
24534
  toTranscriptJsonLine,