@agentv/core 2.15.0 → 2.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-N55K52OO.js → chunk-CPPYERD2.js} +1 -1
- package/dist/chunk-CPPYERD2.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +8 -7
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +9 -8
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +251 -260
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +50 -35
- package/dist/index.d.ts +50 -35
- package/dist/index.js +235 -243
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-N55K52OO.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-N55K52OO.js";
|
|
20
|
+
} from "./chunk-CPPYERD2.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -3114,17 +3114,32 @@ function parseRepoConfig(raw) {
|
|
|
3114
3114
|
...clone !== void 0 && { clone }
|
|
3115
3115
|
};
|
|
3116
3116
|
}
|
|
3117
|
-
function
|
|
3117
|
+
function parseWorkspaceHookConfig(raw, evalFileDir) {
|
|
3118
3118
|
if (!isJsonObject(raw)) return void 0;
|
|
3119
|
+
const script = parseWorkspaceScriptConfig(raw, evalFileDir);
|
|
3119
3120
|
const obj = raw;
|
|
3120
|
-
const
|
|
3121
|
-
|
|
3122
|
-
if (!strategy && afterEach === void 0) return void 0;
|
|
3121
|
+
const reset = obj.reset === "none" || obj.reset === "fast" || obj.reset === "strict" ? obj.reset : void 0;
|
|
3122
|
+
if (!script && !reset) return void 0;
|
|
3123
3123
|
return {
|
|
3124
|
-
...
|
|
3125
|
-
...
|
|
3124
|
+
...script ?? {},
|
|
3125
|
+
...reset !== void 0 && { reset }
|
|
3126
3126
|
};
|
|
3127
3127
|
}
|
|
3128
|
+
function parseWorkspaceHooksConfig(raw, evalFileDir) {
|
|
3129
|
+
if (!isJsonObject(raw)) return void 0;
|
|
3130
|
+
const obj = raw;
|
|
3131
|
+
const beforeAll = parseWorkspaceHookConfig(obj.before_all, evalFileDir);
|
|
3132
|
+
const beforeEach = parseWorkspaceHookConfig(obj.before_each, evalFileDir);
|
|
3133
|
+
const afterEach = parseWorkspaceHookConfig(obj.after_each, evalFileDir);
|
|
3134
|
+
const afterAll = parseWorkspaceHookConfig(obj.after_all, evalFileDir);
|
|
3135
|
+
const hooks = {
|
|
3136
|
+
...beforeAll !== void 0 && { before_all: beforeAll },
|
|
3137
|
+
...beforeEach !== void 0 && { before_each: beforeEach },
|
|
3138
|
+
...afterEach !== void 0 && { after_each: afterEach },
|
|
3139
|
+
...afterAll !== void 0 && { after_all: afterAll }
|
|
3140
|
+
};
|
|
3141
|
+
return Object.keys(hooks).length > 0 ? hooks : void 0;
|
|
3142
|
+
}
|
|
3128
3143
|
async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
3129
3144
|
if (typeof raw === "string") {
|
|
3130
3145
|
const workspaceFilePath = path8.resolve(evalFileDir, raw);
|
|
@@ -3154,37 +3169,48 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
3154
3169
|
}
|
|
3155
3170
|
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
3156
3171
|
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
3157
|
-
const
|
|
3158
|
-
const
|
|
3159
|
-
const
|
|
3160
|
-
const
|
|
3161
|
-
|
|
3162
|
-
if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
3172
|
+
const hooks = parseWorkspaceHooksConfig(obj.hooks, evalFileDir);
|
|
3173
|
+
const mode = obj.mode === "pooled" || obj.mode === "ephemeral" || obj.mode === "static" ? obj.mode : void 0;
|
|
3174
|
+
const staticPath = typeof obj.static_path === "string" ? obj.static_path : void 0;
|
|
3175
|
+
const pool = typeof obj.pool === "boolean" ? obj.pool : void 0;
|
|
3176
|
+
if (!template && !isolation && !repos && !hooks && !mode && !staticPath && pool === void 0)
|
|
3163
3177
|
return void 0;
|
|
3164
3178
|
return {
|
|
3165
3179
|
...template !== void 0 && { template },
|
|
3166
3180
|
...isolation !== void 0 && { isolation },
|
|
3167
3181
|
...repos !== void 0 && { repos },
|
|
3168
|
-
...
|
|
3169
|
-
...
|
|
3170
|
-
...
|
|
3171
|
-
...
|
|
3172
|
-
...afterEach !== void 0 && { after_each: afterEach }
|
|
3182
|
+
...hooks !== void 0 && { hooks },
|
|
3183
|
+
...mode !== void 0 && { mode },
|
|
3184
|
+
...staticPath !== void 0 && { static_path: staticPath },
|
|
3185
|
+
...pool !== void 0 && { pool }
|
|
3173
3186
|
};
|
|
3174
3187
|
}
|
|
3175
3188
|
function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
3176
3189
|
if (!suiteLevel && !caseLevel) return void 0;
|
|
3177
3190
|
if (!suiteLevel) return caseLevel;
|
|
3178
3191
|
if (!caseLevel) return suiteLevel;
|
|
3192
|
+
const mergeHook = (suiteHook, caseHook) => {
|
|
3193
|
+
if (!suiteHook && !caseHook) return void 0;
|
|
3194
|
+
return {
|
|
3195
|
+
...suiteHook ?? {},
|
|
3196
|
+
...caseHook ?? {}
|
|
3197
|
+
};
|
|
3198
|
+
};
|
|
3199
|
+
const mergedHooks = {
|
|
3200
|
+
before_all: mergeHook(suiteLevel.hooks?.before_all, caseLevel.hooks?.before_all),
|
|
3201
|
+
before_each: mergeHook(suiteLevel.hooks?.before_each, caseLevel.hooks?.before_each),
|
|
3202
|
+
after_each: mergeHook(suiteLevel.hooks?.after_each, caseLevel.hooks?.after_each),
|
|
3203
|
+
after_all: mergeHook(suiteLevel.hooks?.after_all, caseLevel.hooks?.after_all)
|
|
3204
|
+
};
|
|
3205
|
+
const hasHooks = Object.values(mergedHooks).some((hook) => hook !== void 0);
|
|
3179
3206
|
return {
|
|
3180
3207
|
template: caseLevel.template ?? suiteLevel.template,
|
|
3181
3208
|
isolation: caseLevel.isolation ?? suiteLevel.isolation,
|
|
3182
3209
|
repos: caseLevel.repos ?? suiteLevel.repos,
|
|
3183
|
-
|
|
3184
|
-
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
after_each: caseLevel.after_each ?? suiteLevel.after_each
|
|
3210
|
+
...hasHooks && { hooks: mergedHooks },
|
|
3211
|
+
mode: caseLevel.mode ?? suiteLevel.mode,
|
|
3212
|
+
static_path: caseLevel.static_path ?? suiteLevel.static_path,
|
|
3213
|
+
pool: caseLevel.pool ?? suiteLevel.pool
|
|
3188
3214
|
};
|
|
3189
3215
|
}
|
|
3190
3216
|
function asString6(value) {
|
|
@@ -5304,6 +5330,7 @@ var CopilotCliProvider = class {
|
|
|
5304
5330
|
const agentProcess = spawn(executable, args, {
|
|
5305
5331
|
stdio: ["pipe", "pipe", "inherit"]
|
|
5306
5332
|
});
|
|
5333
|
+
await waitForProcessSpawn(agentProcess, executable, this.targetName);
|
|
5307
5334
|
const toolCallsInProgress = /* @__PURE__ */ new Map();
|
|
5308
5335
|
const completedToolCalls = [];
|
|
5309
5336
|
let finalContent = "";
|
|
@@ -5583,6 +5610,47 @@ var CopilotCliProvider = class {
|
|
|
5583
5610
|
}
|
|
5584
5611
|
}
|
|
5585
5612
|
};
|
|
5613
|
+
async function waitForProcessSpawn(proc, executable, targetName) {
|
|
5614
|
+
if (proc.pid) {
|
|
5615
|
+
return;
|
|
5616
|
+
}
|
|
5617
|
+
await new Promise((resolve, reject) => {
|
|
5618
|
+
const onSpawn = () => {
|
|
5619
|
+
cleanup();
|
|
5620
|
+
resolve();
|
|
5621
|
+
};
|
|
5622
|
+
const onError = (error) => {
|
|
5623
|
+
cleanup();
|
|
5624
|
+
reject(new Error(formatCopilotSpawnError(error, executable, targetName)));
|
|
5625
|
+
};
|
|
5626
|
+
const cleanup = () => {
|
|
5627
|
+
proc.off("spawn", onSpawn);
|
|
5628
|
+
proc.off("error", onError);
|
|
5629
|
+
};
|
|
5630
|
+
proc.once("spawn", onSpawn);
|
|
5631
|
+
proc.once("error", onError);
|
|
5632
|
+
});
|
|
5633
|
+
}
|
|
5634
|
+
function formatCopilotSpawnError(error, executable, targetName) {
|
|
5635
|
+
const code = error.code;
|
|
5636
|
+
const base = `Failed to start Copilot CLI executable '${executable}' for target '${targetName}'. ${error.message}`;
|
|
5637
|
+
if (process.platform !== "win32") {
|
|
5638
|
+
return base;
|
|
5639
|
+
}
|
|
5640
|
+
if (code !== "ENOENT" && code !== "EINVAL") {
|
|
5641
|
+
return base;
|
|
5642
|
+
}
|
|
5643
|
+
return `${base}
|
|
5644
|
+
|
|
5645
|
+
On Windows, shell commands like 'copilot -h' can work via .ps1/.bat shims, but AgentV launches a subprocess that needs a directly spawnable executable path.
|
|
5646
|
+
|
|
5647
|
+
Fix options:
|
|
5648
|
+
1) Install native Copilot binary package:
|
|
5649
|
+
npm install -g @github/copilot-win32-x64
|
|
5650
|
+
2) Set explicit executable for Copilot targets:
|
|
5651
|
+
- In .env: COPILOT_EXE=C:\\Users\\<you>\\AppData\\Roaming\\npm\\node_modules\\@github\\copilot-win32-x64\\copilot.exe
|
|
5652
|
+
- In .agentv/targets.yaml: executable: \${{ COPILOT_EXE }}`;
|
|
5653
|
+
}
|
|
5586
5654
|
function summarizeAcpEvent(eventType, data) {
|
|
5587
5655
|
if (!data || typeof data !== "object") {
|
|
5588
5656
|
return eventType;
|
|
@@ -7238,9 +7306,6 @@ function getAgentvHome() {
|
|
|
7238
7306
|
function getWorkspacesRoot() {
|
|
7239
7307
|
return path21.join(getAgentvHome(), "workspaces");
|
|
7240
7308
|
}
|
|
7241
|
-
function getGitCacheRoot() {
|
|
7242
|
-
return path21.join(getAgentvHome(), "git-cache");
|
|
7243
|
-
}
|
|
7244
7309
|
function getSubagentsRoot() {
|
|
7245
7310
|
return path21.join(getAgentvHome(), "subagents");
|
|
7246
7311
|
}
|
|
@@ -8700,16 +8765,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
8700
8765
|
});
|
|
8701
8766
|
}
|
|
8702
8767
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
8703
|
-
const { mkdir:
|
|
8768
|
+
const { mkdir: mkdir14, readFile: readFile13, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
8704
8769
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
8705
8770
|
const path42 = await import("node:path");
|
|
8706
8771
|
const { randomUUID: randomUUID8 } = await import("node:crypto");
|
|
8707
8772
|
const dir = path42.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
8708
|
-
await
|
|
8773
|
+
await mkdir14(dir, { recursive: true });
|
|
8709
8774
|
const stdinPath = path42.join(dir, "stdin.txt");
|
|
8710
8775
|
const stdoutPath = path42.join(dir, "stdout.txt");
|
|
8711
8776
|
const stderrPath = path42.join(dir, "stderr.txt");
|
|
8712
|
-
await
|
|
8777
|
+
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
8713
8778
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
8714
8779
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
8715
8780
|
try {
|
|
@@ -8742,7 +8807,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
8742
8807
|
const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
8743
8808
|
return { stdout, stderr, exitCode };
|
|
8744
8809
|
} finally {
|
|
8745
|
-
await
|
|
8810
|
+
await rm6(dir, { recursive: true, force: true });
|
|
8746
8811
|
}
|
|
8747
8812
|
}
|
|
8748
8813
|
|
|
@@ -11930,8 +11995,8 @@ function runEqualsAssertion(output, value) {
|
|
|
11930
11995
|
}
|
|
11931
11996
|
|
|
11932
11997
|
// src/evaluation/orchestrator.ts
|
|
11933
|
-
import { createHash as
|
|
11934
|
-
import { mkdir as
|
|
11998
|
+
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
11999
|
+
import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
|
|
11935
12000
|
import path39 from "node:path";
|
|
11936
12001
|
import micromatch4 from "micromatch";
|
|
11937
12002
|
|
|
@@ -12894,7 +12959,7 @@ var WorkspacePoolManager = class {
|
|
|
12894
12959
|
* 7. Return the slot (with path, index, isExisting)
|
|
12895
12960
|
*/
|
|
12896
12961
|
async acquireWorkspace(options) {
|
|
12897
|
-
const { templatePath, repos, maxSlots, repoManager } = options;
|
|
12962
|
+
const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
|
|
12898
12963
|
const fingerprint = computeWorkspaceFingerprint(templatePath, repos);
|
|
12899
12964
|
const poolDir = path36.join(this.poolRoot, fingerprint);
|
|
12900
12965
|
await mkdir11(poolDir, { recursive: true });
|
|
@@ -12914,7 +12979,7 @@ var WorkspacePoolManager = class {
|
|
|
12914
12979
|
}
|
|
12915
12980
|
const slotExists = existsSync2(slotPath);
|
|
12916
12981
|
if (slotExists) {
|
|
12917
|
-
await this.resetSlot(slotPath, templatePath, repos);
|
|
12982
|
+
await this.resetSlot(slotPath, templatePath, repos, poolReset);
|
|
12918
12983
|
return {
|
|
12919
12984
|
index: i,
|
|
12920
12985
|
path: slotPath,
|
|
@@ -13046,15 +13111,19 @@ var WorkspacePoolManager = class {
|
|
|
13046
13111
|
* 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
|
|
13047
13112
|
* 2. Re-copy template files (skip repo directories)
|
|
13048
13113
|
*/
|
|
13049
|
-
async resetSlot(slotPath, templatePath, repos) {
|
|
13114
|
+
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
13050
13115
|
for (const repo of repos) {
|
|
13051
13116
|
const repoDir = path36.join(slotPath, repo.path);
|
|
13052
13117
|
if (!existsSync2(repoDir)) {
|
|
13053
13118
|
continue;
|
|
13054
13119
|
}
|
|
13120
|
+
if (poolReset === "none") {
|
|
13121
|
+
continue;
|
|
13122
|
+
}
|
|
13055
13123
|
const ref = repo.checkout?.ref ?? "HEAD";
|
|
13056
13124
|
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
13057
|
-
|
|
13125
|
+
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
13126
|
+
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
13058
13127
|
}
|
|
13059
13128
|
if (templatePath) {
|
|
13060
13129
|
const repoDirNames = new Set(
|
|
@@ -13070,14 +13139,10 @@ var WorkspacePoolManager = class {
|
|
|
13070
13139
|
|
|
13071
13140
|
// src/evaluation/workspace/repo-manager.ts
|
|
13072
13141
|
import { execFile as execFile2 } from "node:child_process";
|
|
13073
|
-
import { createHash as createHash2 } from "node:crypto";
|
|
13074
|
-
import { existsSync as existsSync3 } from "node:fs";
|
|
13075
|
-
import { mkdir as mkdir12, rm as rm6, unlink as unlink2, writeFile as writeFile8 } from "node:fs/promises";
|
|
13076
13142
|
import path37 from "node:path";
|
|
13077
13143
|
import { promisify as promisify6 } from "node:util";
|
|
13078
13144
|
var execFileAsync2 = promisify6(execFile2);
|
|
13079
13145
|
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
13080
|
-
var LOCK_TIMEOUT_MS = 6e4;
|
|
13081
13146
|
function gitEnv2() {
|
|
13082
13147
|
const env = { ...process.env };
|
|
13083
13148
|
for (const key of Object.keys(env)) {
|
|
@@ -13092,10 +13157,6 @@ function gitEnv2() {
|
|
|
13092
13157
|
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
13093
13158
|
};
|
|
13094
13159
|
}
|
|
13095
|
-
function cacheKey(source) {
|
|
13096
|
-
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
13097
|
-
return createHash2("sha256").update(raw).digest("hex");
|
|
13098
|
-
}
|
|
13099
13160
|
function getSourceUrl(source) {
|
|
13100
13161
|
return source.type === "git" ? source.url : source.path;
|
|
13101
13162
|
}
|
|
@@ -13109,33 +13170,9 @@ async function git2(args, opts) {
|
|
|
13109
13170
|
});
|
|
13110
13171
|
return stdout.trim();
|
|
13111
13172
|
}
|
|
13112
|
-
async function acquireLock(lockPath) {
|
|
13113
|
-
const start = Date.now();
|
|
13114
|
-
while (Date.now() - start < LOCK_TIMEOUT_MS) {
|
|
13115
|
-
try {
|
|
13116
|
-
await writeFile8(lockPath, String(process.pid), { flag: "wx" });
|
|
13117
|
-
return;
|
|
13118
|
-
} catch (err) {
|
|
13119
|
-
if (err.code === "EEXIST") {
|
|
13120
|
-
await new Promise((r) => setTimeout(r, 200));
|
|
13121
|
-
continue;
|
|
13122
|
-
}
|
|
13123
|
-
throw err;
|
|
13124
|
-
}
|
|
13125
|
-
}
|
|
13126
|
-
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
13127
|
-
}
|
|
13128
|
-
async function releaseLock(lockPath) {
|
|
13129
|
-
try {
|
|
13130
|
-
await unlink2(lockPath);
|
|
13131
|
-
} catch {
|
|
13132
|
-
}
|
|
13133
|
-
}
|
|
13134
13173
|
var RepoManager = class {
|
|
13135
|
-
cacheDir;
|
|
13136
13174
|
verbose;
|
|
13137
|
-
constructor(
|
|
13138
|
-
this.cacheDir = cacheDir ?? getGitCacheRoot();
|
|
13175
|
+
constructor(verbose = false) {
|
|
13139
13176
|
this.verbose = verbose;
|
|
13140
13177
|
}
|
|
13141
13178
|
async runGit(args, opts) {
|
|
@@ -13160,86 +13197,18 @@ var RepoManager = class {
|
|
|
13160
13197
|
}
|
|
13161
13198
|
}
|
|
13162
13199
|
/**
|
|
13163
|
-
*
|
|
13164
|
-
* Creates on first access, fetches updates on subsequent calls.
|
|
13165
|
-
* Returns the absolute path to the cache directory.
|
|
13166
|
-
*/
|
|
13167
|
-
async ensureCache(source, depth, resolve) {
|
|
13168
|
-
const key = cacheKey(source);
|
|
13169
|
-
const cachePath = path37.join(this.cacheDir, key);
|
|
13170
|
-
const lockPath = `${cachePath}.lock`;
|
|
13171
|
-
const cacheExists = existsSync3(path37.join(cachePath, "HEAD"));
|
|
13172
|
-
if (this.verbose) {
|
|
13173
|
-
console.log(
|
|
13174
|
-
`[repo] ensureCache source=${getSourceUrl(source)} resolve=${resolve ?? "remote"} cache=${cachePath}`
|
|
13175
|
-
);
|
|
13176
|
-
}
|
|
13177
|
-
if (resolve === "local") {
|
|
13178
|
-
if (cacheExists) {
|
|
13179
|
-
if (this.verbose) {
|
|
13180
|
-
console.log(`[repo] using existing local cache ${cachePath}`);
|
|
13181
|
-
}
|
|
13182
|
-
return cachePath;
|
|
13183
|
-
}
|
|
13184
|
-
const url = getSourceUrl(source);
|
|
13185
|
-
throw new Error(
|
|
13186
|
-
`No cache found for \`${url}\`. Run \`agentv cache add --url ${url} --from <local-path>\` to seed it.`
|
|
13187
|
-
);
|
|
13188
|
-
}
|
|
13189
|
-
await mkdir12(this.cacheDir, { recursive: true });
|
|
13190
|
-
const lockStartedAt = Date.now();
|
|
13191
|
-
await acquireLock(lockPath);
|
|
13192
|
-
if (this.verbose) {
|
|
13193
|
-
console.log(`[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`);
|
|
13194
|
-
}
|
|
13195
|
-
try {
|
|
13196
|
-
if (cacheExists) {
|
|
13197
|
-
if (this.verbose) {
|
|
13198
|
-
console.log(`[repo] refreshing existing cache ${cachePath}`);
|
|
13199
|
-
}
|
|
13200
|
-
const fetchArgs = ["fetch", "--prune"];
|
|
13201
|
-
if (depth) {
|
|
13202
|
-
fetchArgs.push("--depth", String(depth));
|
|
13203
|
-
}
|
|
13204
|
-
await this.runGit(fetchArgs, { cwd: cachePath });
|
|
13205
|
-
} else {
|
|
13206
|
-
if (this.verbose) {
|
|
13207
|
-
console.log(`[repo] creating new cache ${cachePath}`);
|
|
13208
|
-
}
|
|
13209
|
-
const cloneArgs = ["clone", "--mirror", "--bare"];
|
|
13210
|
-
if (depth) {
|
|
13211
|
-
cloneArgs.push("--depth", String(depth));
|
|
13212
|
-
}
|
|
13213
|
-
const sourceUrl = getSourceUrl(source);
|
|
13214
|
-
const cloneUrl = depth && source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
|
|
13215
|
-
cloneArgs.push(cloneUrl, cachePath);
|
|
13216
|
-
await this.runGit(cloneArgs);
|
|
13217
|
-
}
|
|
13218
|
-
} finally {
|
|
13219
|
-
await releaseLock(lockPath);
|
|
13220
|
-
if (this.verbose) {
|
|
13221
|
-
console.log(`[repo] lock released path=${lockPath}`);
|
|
13222
|
-
}
|
|
13223
|
-
}
|
|
13224
|
-
return cachePath;
|
|
13225
|
-
}
|
|
13226
|
-
/**
|
|
13227
|
-
* Clone a repo from cache into the workspace at the configured path.
|
|
13200
|
+
* Clone a repo directly from source into the workspace at the configured path.
|
|
13228
13201
|
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
13229
13202
|
*/
|
|
13230
13203
|
async materialize(repo, workspacePath) {
|
|
13231
13204
|
const targetDir = path37.join(workspacePath, repo.path);
|
|
13205
|
+
const sourceUrl = getSourceUrl(repo.source);
|
|
13232
13206
|
const startedAt = Date.now();
|
|
13233
13207
|
if (this.verbose) {
|
|
13234
13208
|
console.log(
|
|
13235
|
-
`[repo] materialize start path=${repo.path} source=${getSourceUrl(repo.source)} workspace=${workspacePath}`
|
|
13209
|
+
`[repo] materialize start path=${repo.path} source=${sourceUrl} workspace=${workspacePath}`
|
|
13236
13210
|
);
|
|
13237
13211
|
}
|
|
13238
|
-
const cachePath = await this.ensureCache(
|
|
13239
|
-
repo.source,
|
|
13240
|
-
repo.clone?.depth,
|
|
13241
|
-
repo.checkout?.resolve
|
|
13242
|
-
);
|
|
13243
13212
|
const cloneArgs = ["clone"];
|
|
13244
13213
|
if (repo.clone?.depth) {
|
|
13245
13214
|
cloneArgs.push("--depth", String(repo.clone.depth));
|
|
@@ -13248,7 +13217,7 @@ var RepoManager = class {
|
|
|
13248
13217
|
cloneArgs.push("--filter", repo.clone.filter);
|
|
13249
13218
|
}
|
|
13250
13219
|
cloneArgs.push("--no-checkout");
|
|
13251
|
-
const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${
|
|
13220
|
+
const cloneUrl = (repo.clone?.depth || repo.clone?.filter) && repo.source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
|
|
13252
13221
|
cloneArgs.push(cloneUrl, targetDir);
|
|
13253
13222
|
await this.runGit(cloneArgs);
|
|
13254
13223
|
if (repo.clone?.sparse?.length) {
|
|
@@ -13320,51 +13289,13 @@ var RepoManager = class {
|
|
|
13320
13289
|
}
|
|
13321
13290
|
}
|
|
13322
13291
|
/** Reset repos in workspace to their checkout state. */
|
|
13323
|
-
async reset(repos, workspacePath,
|
|
13324
|
-
|
|
13325
|
-
for (const repo of repos) {
|
|
13326
|
-
const targetDir = path37.join(workspacePath, repo.path);
|
|
13327
|
-
await rm6(targetDir, { recursive: true, force: true });
|
|
13328
|
-
}
|
|
13329
|
-
await this.materializeAll(repos, workspacePath);
|
|
13330
|
-
return;
|
|
13331
|
-
}
|
|
13292
|
+
async reset(repos, workspacePath, reset) {
|
|
13293
|
+
const cleanFlag = reset === "strict" ? "-fdx" : "-fd";
|
|
13332
13294
|
for (const repo of repos) {
|
|
13333
13295
|
const targetDir = path37.join(workspacePath, repo.path);
|
|
13334
13296
|
await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
|
|
13335
|
-
await this.runGit(["clean",
|
|
13336
|
-
}
|
|
13337
|
-
}
|
|
13338
|
-
/**
|
|
13339
|
-
* Seed the cache from a local repository, setting the remote to a given URL.
|
|
13340
|
-
* Useful for avoiding slow network clones when a local clone already exists.
|
|
13341
|
-
*/
|
|
13342
|
-
async seedCache(localPath, remoteUrl, opts) {
|
|
13343
|
-
const source = { type: "git", url: remoteUrl };
|
|
13344
|
-
const key = cacheKey(source);
|
|
13345
|
-
const cachePath = path37.join(this.cacheDir, key);
|
|
13346
|
-
const lockPath = `${cachePath}.lock`;
|
|
13347
|
-
await mkdir12(this.cacheDir, { recursive: true });
|
|
13348
|
-
await acquireLock(lockPath);
|
|
13349
|
-
try {
|
|
13350
|
-
if (existsSync3(path37.join(cachePath, "HEAD"))) {
|
|
13351
|
-
if (!opts?.force) {
|
|
13352
|
-
throw new Error(
|
|
13353
|
-
`Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
|
|
13354
|
-
);
|
|
13355
|
-
}
|
|
13356
|
-
await rm6(cachePath, { recursive: true, force: true });
|
|
13357
|
-
}
|
|
13358
|
-
await git2(["clone", "--mirror", "--bare", localPath, cachePath]);
|
|
13359
|
-
await git2(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
|
|
13360
|
-
} finally {
|
|
13361
|
-
await releaseLock(lockPath);
|
|
13297
|
+
await this.runGit(["clean", cleanFlag], { cwd: targetDir });
|
|
13362
13298
|
}
|
|
13363
|
-
return cachePath;
|
|
13364
|
-
}
|
|
13365
|
-
/** Remove the entire cache directory. */
|
|
13366
|
-
async cleanCache() {
|
|
13367
|
-
await rm6(this.cacheDir, { recursive: true, force: true });
|
|
13368
13299
|
}
|
|
13369
13300
|
};
|
|
13370
13301
|
|
|
@@ -13450,6 +13381,22 @@ function classifyQualityStatus(score) {
|
|
|
13450
13381
|
function usesFileReferencePrompt(provider) {
|
|
13451
13382
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
13452
13383
|
}
|
|
13384
|
+
function toScriptConfig(hook, hookName, context) {
|
|
13385
|
+
const command = hook.command ?? hook.script;
|
|
13386
|
+
if (!command || command.length === 0) {
|
|
13387
|
+
throw new Error(`${hookName} hook in ${context} requires command or script`);
|
|
13388
|
+
}
|
|
13389
|
+
return {
|
|
13390
|
+
command,
|
|
13391
|
+
...hook.timeout_ms !== void 0 && { timeout_ms: hook.timeout_ms },
|
|
13392
|
+
...hook.timeoutMs !== void 0 && { timeoutMs: hook.timeoutMs },
|
|
13393
|
+
...hook.cwd !== void 0 && { cwd: hook.cwd },
|
|
13394
|
+
...hook.script !== void 0 && { script: hook.script }
|
|
13395
|
+
};
|
|
13396
|
+
}
|
|
13397
|
+
function hasHookCommand(hook) {
|
|
13398
|
+
return !!(hook?.command && hook.command.length > 0 || hook?.script && hook.script.length > 0);
|
|
13399
|
+
}
|
|
13453
13400
|
function getWorkspaceTemplate(target) {
|
|
13454
13401
|
const config = target.config;
|
|
13455
13402
|
if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
|
|
@@ -13483,7 +13430,12 @@ async function runEvaluation(options) {
|
|
|
13483
13430
|
failOnError,
|
|
13484
13431
|
poolWorkspaces,
|
|
13485
13432
|
poolMaxSlots: configPoolMaxSlots,
|
|
13486
|
-
workspace:
|
|
13433
|
+
workspace: legacyWorkspacePath,
|
|
13434
|
+
workspaceMode,
|
|
13435
|
+
workspacePath,
|
|
13436
|
+
workspaceClean,
|
|
13437
|
+
retainOnSuccess,
|
|
13438
|
+
retainOnFailure
|
|
13487
13439
|
} = options;
|
|
13488
13440
|
let useCache = options.useCache;
|
|
13489
13441
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -13619,13 +13571,22 @@ async function runEvaluation(options) {
|
|
|
13619
13571
|
}
|
|
13620
13572
|
};
|
|
13621
13573
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
13622
|
-
|
|
13574
|
+
const configuredMode = suiteWorkspace?.mode ?? workspaceMode;
|
|
13575
|
+
const configuredStaticPath = suiteWorkspace?.static_path ?? workspacePath ?? legacyWorkspacePath;
|
|
13576
|
+
const useStaticWorkspace = configuredMode === "static" || !!configuredStaticPath && !configuredMode;
|
|
13577
|
+
if (useStaticWorkspace && isPerTestIsolation) {
|
|
13623
13578
|
throw new Error(
|
|
13624
|
-
"
|
|
13579
|
+
"static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default)."
|
|
13625
13580
|
);
|
|
13626
13581
|
}
|
|
13627
|
-
|
|
13628
|
-
|
|
13582
|
+
if (configuredMode === "static" && !configuredStaticPath) {
|
|
13583
|
+
throw new Error("workspace.mode=static requires workspace.static_path or --workspace-path");
|
|
13584
|
+
}
|
|
13585
|
+
const hasSharedWorkspace = !!(useStaticWorkspace || workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
13586
|
+
const poolEnabled = configuredMode === "pooled" ? true : configuredMode === "ephemeral" || useStaticWorkspace ? false : suiteWorkspace?.pool ?? poolWorkspaces ?? true;
|
|
13587
|
+
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
13588
|
+
const resolvedRetainOnSuccess = retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
|
|
13589
|
+
const resolvedRetainOnFailure = retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
13629
13590
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
13630
13591
|
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
13631
13592
|
setupLog(
|
|
@@ -13646,20 +13607,21 @@ async function runEvaluation(options) {
|
|
|
13646
13607
|
const availablePoolSlots = [];
|
|
13647
13608
|
const poolSlotBaselines = /* @__PURE__ */ new Map();
|
|
13648
13609
|
const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
|
|
13649
|
-
if (
|
|
13650
|
-
sharedWorkspacePath =
|
|
13651
|
-
setupLog(`using
|
|
13610
|
+
if (useStaticWorkspace && configuredStaticPath) {
|
|
13611
|
+
sharedWorkspacePath = configuredStaticPath;
|
|
13612
|
+
setupLog(`using static workspace: ${configuredStaticPath}`);
|
|
13652
13613
|
} else if (usePool && suiteWorkspace?.repos) {
|
|
13653
13614
|
const slotsNeeded = workers;
|
|
13654
13615
|
setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
|
|
13655
13616
|
poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
|
|
13656
|
-
const poolRepoManager = new RepoManager(
|
|
13617
|
+
const poolRepoManager = new RepoManager(verbose);
|
|
13657
13618
|
for (let i = 0; i < slotsNeeded; i++) {
|
|
13658
13619
|
const slot = await poolManager.acquireWorkspace({
|
|
13659
13620
|
templatePath: workspaceTemplate,
|
|
13660
13621
|
repos: suiteWorkspace.repos,
|
|
13661
13622
|
maxSlots: poolMaxSlots,
|
|
13662
|
-
repoManager: poolRepoManager
|
|
13623
|
+
repoManager: poolRepoManager,
|
|
13624
|
+
poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? "fast"
|
|
13663
13625
|
});
|
|
13664
13626
|
poolSlots.push(slot);
|
|
13665
13627
|
setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
|
|
@@ -13679,9 +13641,9 @@ async function runEvaluation(options) {
|
|
|
13679
13641
|
const message = error instanceof Error ? error.message : String(error);
|
|
13680
13642
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
13681
13643
|
}
|
|
13682
|
-
} else if (suiteWorkspace?.
|
|
13644
|
+
} else if (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
13683
13645
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
13684
|
-
await
|
|
13646
|
+
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
13685
13647
|
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
13686
13648
|
}
|
|
13687
13649
|
try {
|
|
@@ -13693,7 +13655,7 @@ async function runEvaluation(options) {
|
|
|
13693
13655
|
} catch {
|
|
13694
13656
|
}
|
|
13695
13657
|
}
|
|
13696
|
-
const repoManager = suiteWorkspace?.repos?.length && !usePool && !
|
|
13658
|
+
const repoManager = suiteWorkspace?.repos?.length && !usePool && !useStaticWorkspace ? new RepoManager(verbose) : void 0;
|
|
13697
13659
|
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
13698
13660
|
setupLog(
|
|
13699
13661
|
`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
|
|
@@ -13703,17 +13665,19 @@ async function runEvaluation(options) {
|
|
|
13703
13665
|
setupLog("shared repo materialization complete");
|
|
13704
13666
|
} catch (error) {
|
|
13705
13667
|
const message = error instanceof Error ? error.message : String(error);
|
|
13706
|
-
if (sharedWorkspacePath && !
|
|
13668
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
13707
13669
|
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13708
13670
|
});
|
|
13709
13671
|
}
|
|
13710
13672
|
throw new Error(`Failed to materialize repos: ${message}`);
|
|
13711
13673
|
}
|
|
13712
13674
|
}
|
|
13713
|
-
|
|
13714
|
-
|
|
13675
|
+
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
|
|
13676
|
+
if (sharedWorkspacePath && hasHookCommand(suiteBeforeAllHook)) {
|
|
13677
|
+
const beforeAllHook = suiteBeforeAllHook;
|
|
13678
|
+
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
13715
13679
|
setupLog(
|
|
13716
|
-
`running shared before_all in cwd=${
|
|
13680
|
+
`running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
13717
13681
|
);
|
|
13718
13682
|
const scriptContext = {
|
|
13719
13683
|
workspacePath: sharedWorkspacePath,
|
|
@@ -13722,18 +13686,22 @@ async function runEvaluation(options) {
|
|
|
13722
13686
|
evalDir
|
|
13723
13687
|
};
|
|
13724
13688
|
try {
|
|
13725
|
-
beforeAllOutput = await executeWorkspaceScript(
|
|
13689
|
+
beforeAllOutput = await executeWorkspaceScript(
|
|
13690
|
+
toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
|
|
13691
|
+
scriptContext
|
|
13692
|
+
);
|
|
13726
13693
|
setupLog("shared before_all completed");
|
|
13727
13694
|
} catch (error) {
|
|
13728
13695
|
const message = error instanceof Error ? error.message : String(error);
|
|
13729
|
-
if (sharedWorkspacePath && !
|
|
13696
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
13730
13697
|
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13731
13698
|
});
|
|
13732
13699
|
}
|
|
13733
13700
|
throw new Error(`before_all script failed: ${message}`);
|
|
13734
13701
|
}
|
|
13735
13702
|
}
|
|
13736
|
-
if (availablePoolSlots.length > 0 &&
|
|
13703
|
+
if (availablePoolSlots.length > 0 && hasHookCommand(suiteBeforeAllHook)) {
|
|
13704
|
+
const beforeAllHook = suiteBeforeAllHook;
|
|
13737
13705
|
for (const slot of availablePoolSlots) {
|
|
13738
13706
|
setupLog(`running before_all on pool slot ${slot.index}`);
|
|
13739
13707
|
const scriptContext = {
|
|
@@ -13743,7 +13711,10 @@ async function runEvaluation(options) {
|
|
|
13743
13711
|
evalDir
|
|
13744
13712
|
};
|
|
13745
13713
|
try {
|
|
13746
|
-
const output = await executeWorkspaceScript(
|
|
13714
|
+
const output = await executeWorkspaceScript(
|
|
13715
|
+
toScriptConfig(beforeAllHook, "before_all", "suite workspace"),
|
|
13716
|
+
scriptContext
|
|
13717
|
+
);
|
|
13747
13718
|
if (!beforeAllOutput) beforeAllOutput = output;
|
|
13748
13719
|
setupLog(`before_all completed on pool slot ${slot.index}`);
|
|
13749
13720
|
} catch (error) {
|
|
@@ -13875,6 +13846,8 @@ async function runEvaluation(options) {
|
|
|
13875
13846
|
evalRunId,
|
|
13876
13847
|
keepWorkspaces,
|
|
13877
13848
|
cleanupWorkspaces,
|
|
13849
|
+
retainOnSuccess: resolvedRetainOnSuccess,
|
|
13850
|
+
retainOnFailure: resolvedRetainOnFailure,
|
|
13878
13851
|
sharedWorkspacePath: testWorkspacePath,
|
|
13879
13852
|
sharedBaselineCommit: testBaselineCommit,
|
|
13880
13853
|
suiteWorkspaceFile,
|
|
@@ -13968,7 +13941,9 @@ async function runEvaluation(options) {
|
|
|
13968
13941
|
}
|
|
13969
13942
|
}
|
|
13970
13943
|
const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
|
|
13971
|
-
|
|
13944
|
+
const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all;
|
|
13945
|
+
if (afterAllWorkspaces.length > 0 && hasHookCommand(suiteAfterAllHook)) {
|
|
13946
|
+
const afterAllHook = suiteAfterAllHook;
|
|
13972
13947
|
for (const wsPath of afterAllWorkspaces) {
|
|
13973
13948
|
const scriptContext = {
|
|
13974
13949
|
workspacePath: wsPath,
|
|
@@ -13978,7 +13953,7 @@ async function runEvaluation(options) {
|
|
|
13978
13953
|
};
|
|
13979
13954
|
try {
|
|
13980
13955
|
const afterAllOutput = await executeWorkspaceScript(
|
|
13981
|
-
|
|
13956
|
+
toScriptConfig(afterAllHook, "after_all", "suite workspace"),
|
|
13982
13957
|
scriptContext,
|
|
13983
13958
|
"warn"
|
|
13984
13959
|
);
|
|
@@ -13989,12 +13964,14 @@ async function runEvaluation(options) {
|
|
|
13989
13964
|
}
|
|
13990
13965
|
}
|
|
13991
13966
|
}
|
|
13992
|
-
if (sharedWorkspacePath && !poolSlot && poolSlots.length === 0 && !
|
|
13967
|
+
if (sharedWorkspacePath && !poolSlot && poolSlots.length === 0 && !useStaticWorkspace) {
|
|
13993
13968
|
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
13994
|
-
if (
|
|
13995
|
-
|
|
13996
|
-
|
|
13997
|
-
|
|
13969
|
+
if (hasFailure) {
|
|
13970
|
+
if (resolvedRetainOnFailure === "cleanup") {
|
|
13971
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13972
|
+
});
|
|
13973
|
+
}
|
|
13974
|
+
} else if (resolvedRetainOnSuccess === "cleanup") {
|
|
13998
13975
|
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13999
13976
|
});
|
|
14000
13977
|
}
|
|
@@ -14188,6 +14165,8 @@ async function runEvalCase(options) {
|
|
|
14188
14165
|
evalRunId,
|
|
14189
14166
|
keepWorkspaces,
|
|
14190
14167
|
cleanupWorkspaces: forceCleanup,
|
|
14168
|
+
retainOnSuccess,
|
|
14169
|
+
retainOnFailure,
|
|
14191
14170
|
sharedWorkspacePath,
|
|
14192
14171
|
sharedBaselineCommit,
|
|
14193
14172
|
suiteWorkspaceFile,
|
|
@@ -14199,10 +14178,10 @@ async function runEvalCase(options) {
|
|
|
14199
14178
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
14200
14179
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
14201
14180
|
const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
|
|
14202
|
-
const
|
|
14181
|
+
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
14203
14182
|
let cachedResponse;
|
|
14204
|
-
if (
|
|
14205
|
-
cachedResponse = await cache.get(
|
|
14183
|
+
if (cacheKey && cache) {
|
|
14184
|
+
cachedResponse = await cache.get(cacheKey);
|
|
14206
14185
|
}
|
|
14207
14186
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
14208
14187
|
let workspacePath = sharedWorkspacePath;
|
|
@@ -14241,12 +14220,12 @@ async function runEvalCase(options) {
|
|
|
14241
14220
|
}
|
|
14242
14221
|
}
|
|
14243
14222
|
}
|
|
14244
|
-
if (!workspacePath && (evalCase.workspace?.
|
|
14223
|
+
if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
14245
14224
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
14246
|
-
await
|
|
14225
|
+
await mkdir12(workspacePath, { recursive: true });
|
|
14247
14226
|
}
|
|
14248
14227
|
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
14249
|
-
const perCaseRepoManager = new RepoManager(
|
|
14228
|
+
const perCaseRepoManager = new RepoManager(setupDebug);
|
|
14250
14229
|
try {
|
|
14251
14230
|
if (setupDebug) {
|
|
14252
14231
|
console.log(
|
|
@@ -14271,11 +14250,13 @@ async function runEvalCase(options) {
|
|
|
14271
14250
|
);
|
|
14272
14251
|
}
|
|
14273
14252
|
}
|
|
14274
|
-
|
|
14275
|
-
|
|
14253
|
+
const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all;
|
|
14254
|
+
if (workspacePath && hasHookCommand(caseBeforeAllHook)) {
|
|
14255
|
+
const beforeAllHook = caseBeforeAllHook;
|
|
14256
|
+
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
14276
14257
|
if (setupDebug) {
|
|
14277
14258
|
console.log(
|
|
14278
|
-
`[setup] test=${evalCase.id} running before_all in cwd=${
|
|
14259
|
+
`[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
14279
14260
|
);
|
|
14280
14261
|
}
|
|
14281
14262
|
const scriptContext = {
|
|
@@ -14288,7 +14269,7 @@ async function runEvalCase(options) {
|
|
|
14288
14269
|
};
|
|
14289
14270
|
try {
|
|
14290
14271
|
beforeAllOutput = await executeWorkspaceScript(
|
|
14291
|
-
evalCase.
|
|
14272
|
+
toScriptConfig(beforeAllHook, "before_all", `test '${evalCase.id}'`),
|
|
14292
14273
|
scriptContext
|
|
14293
14274
|
);
|
|
14294
14275
|
if (setupDebug) {
|
|
@@ -14313,7 +14294,9 @@ async function runEvalCase(options) {
|
|
|
14313
14294
|
}
|
|
14314
14295
|
}
|
|
14315
14296
|
}
|
|
14316
|
-
|
|
14297
|
+
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
|
|
14298
|
+
if (workspacePath && hasHookCommand(caseBeforeEachHook)) {
|
|
14299
|
+
const beforeEachHook = caseBeforeEachHook;
|
|
14317
14300
|
const scriptContext = {
|
|
14318
14301
|
workspacePath,
|
|
14319
14302
|
testId: evalCase.id,
|
|
@@ -14324,7 +14307,7 @@ async function runEvalCase(options) {
|
|
|
14324
14307
|
};
|
|
14325
14308
|
try {
|
|
14326
14309
|
beforeEachOutput = await executeWorkspaceScript(
|
|
14327
|
-
evalCase.
|
|
14310
|
+
toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
|
|
14328
14311
|
scriptContext
|
|
14329
14312
|
);
|
|
14330
14313
|
} catch (error) {
|
|
@@ -14412,8 +14395,8 @@ async function runEvalCase(options) {
|
|
|
14412
14395
|
}
|
|
14413
14396
|
return errorResult;
|
|
14414
14397
|
}
|
|
14415
|
-
if (
|
|
14416
|
-
await cache.set(
|
|
14398
|
+
if (cacheKey && cache && !cachedResponse) {
|
|
14399
|
+
await cache.set(cacheKey, providerResponse);
|
|
14417
14400
|
}
|
|
14418
14401
|
const output = providerResponse.output;
|
|
14419
14402
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
@@ -14441,17 +14424,19 @@ async function runEvalCase(options) {
|
|
|
14441
14424
|
}
|
|
14442
14425
|
}
|
|
14443
14426
|
const providerError = extractProviderError(providerResponse);
|
|
14444
|
-
if (repoManager && workspacePath && evalCase.workspace?.
|
|
14427
|
+
if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
|
|
14445
14428
|
try {
|
|
14446
14429
|
await repoManager.reset(
|
|
14447
14430
|
evalCase.workspace.repos,
|
|
14448
14431
|
workspacePath,
|
|
14449
|
-
evalCase.workspace.reset
|
|
14432
|
+
evalCase.workspace.hooks.after_each.reset
|
|
14450
14433
|
);
|
|
14451
14434
|
} catch {
|
|
14452
14435
|
}
|
|
14453
14436
|
}
|
|
14454
|
-
|
|
14437
|
+
const caseAfterEachHook = evalCase.workspace?.hooks?.after_each;
|
|
14438
|
+
if (workspacePath && hasHookCommand(caseAfterEachHook)) {
|
|
14439
|
+
const afterEachHook = caseAfterEachHook;
|
|
14455
14440
|
const scriptContext = {
|
|
14456
14441
|
workspacePath,
|
|
14457
14442
|
testId: evalCase.id,
|
|
@@ -14462,7 +14447,7 @@ async function runEvalCase(options) {
|
|
|
14462
14447
|
};
|
|
14463
14448
|
try {
|
|
14464
14449
|
afterEachOutput = await executeWorkspaceScript(
|
|
14465
|
-
evalCase.
|
|
14450
|
+
toScriptConfig(afterEachHook, "after_each", `test '${evalCase.id}'`),
|
|
14466
14451
|
scriptContext,
|
|
14467
14452
|
"warn"
|
|
14468
14453
|
);
|
|
@@ -14512,8 +14497,13 @@ async function runEvalCase(options) {
|
|
|
14512
14497
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
14513
14498
|
});
|
|
14514
14499
|
} else if (isFailure) {
|
|
14515
|
-
|
|
14516
|
-
|
|
14500
|
+
if ((retainOnFailure ?? "keep") === "cleanup") {
|
|
14501
|
+
await cleanupWorkspace(workspacePath).catch(() => {
|
|
14502
|
+
});
|
|
14503
|
+
} else {
|
|
14504
|
+
return { ...finalResult, workspacePath };
|
|
14505
|
+
}
|
|
14506
|
+
} else if ((retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup")) !== "keep") {
|
|
14517
14507
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
14518
14508
|
});
|
|
14519
14509
|
}
|
|
@@ -14531,11 +14521,12 @@ async function runEvalCase(options) {
|
|
|
14531
14521
|
"evaluator_error"
|
|
14532
14522
|
);
|
|
14533
14523
|
if (workspacePath && !isSharedWorkspace) {
|
|
14534
|
-
if (forceCleanup) {
|
|
14524
|
+
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
14535
14525
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
14536
14526
|
});
|
|
14527
|
+
} else {
|
|
14528
|
+
return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
|
|
14537
14529
|
}
|
|
14538
|
-
return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
|
|
14539
14530
|
}
|
|
14540
14531
|
return { ...errorResult, beforeEachOutput, afterEachOutput };
|
|
14541
14532
|
}
|
|
@@ -14554,7 +14545,9 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
14554
14545
|
useCache: false,
|
|
14555
14546
|
// Force cleanup for intermediate trials
|
|
14556
14547
|
cleanupWorkspaces: isLastDeclaredTrial ? options.cleanupWorkspaces : true,
|
|
14557
|
-
keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false
|
|
14548
|
+
keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false,
|
|
14549
|
+
retainOnSuccess: isLastDeclaredTrial ? options.retainOnSuccess : "cleanup",
|
|
14550
|
+
retainOnFailure: isLastDeclaredTrial ? options.retainOnFailure : "cleanup"
|
|
14558
14551
|
};
|
|
14559
14552
|
const result = await runEvalCase(trialOptions);
|
|
14560
14553
|
allResults.push(result);
|
|
@@ -15077,7 +15070,7 @@ function extractProviderError(response) {
|
|
|
15077
15070
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
15078
15071
|
}
|
|
15079
15072
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
15080
|
-
const hash =
|
|
15073
|
+
const hash = createHash2("sha256");
|
|
15081
15074
|
hash.update(provider.id);
|
|
15082
15075
|
hash.update(target.name);
|
|
15083
15076
|
hash.update(evalCase.id);
|
|
@@ -15145,7 +15138,7 @@ function computeWeightedMean(entries) {
|
|
|
15145
15138
|
}
|
|
15146
15139
|
|
|
15147
15140
|
// src/evaluation/evaluate.ts
|
|
15148
|
-
import { existsSync as
|
|
15141
|
+
import { existsSync as existsSync3 } from "node:fs";
|
|
15149
15142
|
import path40 from "node:path";
|
|
15150
15143
|
async function evaluate(config) {
|
|
15151
15144
|
const startTime = Date.now();
|
|
@@ -15264,7 +15257,7 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
15264
15257
|
for (const dir of chain) {
|
|
15265
15258
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
15266
15259
|
const targetsPath = path40.join(dir, candidate);
|
|
15267
|
-
if (!
|
|
15260
|
+
if (!existsSync3(targetsPath)) continue;
|
|
15268
15261
|
try {
|
|
15269
15262
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
15270
15263
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -15282,7 +15275,7 @@ async function loadEnvHierarchy(repoRoot) {
|
|
|
15282
15275
|
const envFiles = [];
|
|
15283
15276
|
for (const dir of chain) {
|
|
15284
15277
|
const envPath = path40.join(dir, ".env");
|
|
15285
|
-
if (
|
|
15278
|
+
if (existsSync3(envPath)) envFiles.push(envPath);
|
|
15286
15279
|
}
|
|
15287
15280
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
15288
15281
|
try {
|
|
@@ -15360,12 +15353,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
15360
15353
|
".agentv/config.js"
|
|
15361
15354
|
];
|
|
15362
15355
|
async function loadTsConfig(projectRoot) {
|
|
15363
|
-
const { existsSync:
|
|
15356
|
+
const { existsSync: existsSync4 } = await import("node:fs");
|
|
15364
15357
|
const { pathToFileURL } = await import("node:url");
|
|
15365
15358
|
const { join: join2 } = await import("node:path");
|
|
15366
15359
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
15367
15360
|
const filePath = join2(projectRoot, fileName);
|
|
15368
|
-
if (!
|
|
15361
|
+
if (!existsSync4(filePath)) {
|
|
15369
15362
|
continue;
|
|
15370
15363
|
}
|
|
15371
15364
|
try {
|
|
@@ -15462,7 +15455,7 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
15462
15455
|
}
|
|
15463
15456
|
|
|
15464
15457
|
// src/evaluation/cache/response-cache.ts
|
|
15465
|
-
import { mkdir as
|
|
15458
|
+
import { mkdir as mkdir13, readFile as readFile12, writeFile as writeFile8 } from "node:fs/promises";
|
|
15466
15459
|
import path41 from "node:path";
|
|
15467
15460
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
15468
15461
|
var ResponseCache = class {
|
|
@@ -15482,8 +15475,8 @@ var ResponseCache = class {
|
|
|
15482
15475
|
async set(key, value) {
|
|
15483
15476
|
const filePath = this.keyToPath(key);
|
|
15484
15477
|
const dir = path41.dirname(filePath);
|
|
15485
|
-
await
|
|
15486
|
-
await
|
|
15478
|
+
await mkdir13(dir, { recursive: true });
|
|
15479
|
+
await writeFile8(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
15487
15480
|
}
|
|
15488
15481
|
keyToPath(key) {
|
|
15489
15482
|
const prefix = key.slice(0, 2);
|
|
@@ -16017,7 +16010,6 @@ export {
|
|
|
16017
16010
|
freeformEvaluationSchema,
|
|
16018
16011
|
generateRubrics,
|
|
16019
16012
|
getAgentvHome,
|
|
16020
|
-
getGitCacheRoot,
|
|
16021
16013
|
getHitCount,
|
|
16022
16014
|
getSubagentsRoot,
|
|
16023
16015
|
getTraceStateRoot,
|