@agentv/core 2.14.3 → 2.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-N55K52OO.js → chunk-E6AJPAXM.js} +1 -1
- package/dist/chunk-E6AJPAXM.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +8 -7
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +9 -8
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1079 -610
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +139 -34
- package/dist/index.d.ts +139 -34
- package/dist/index.js +1074 -607
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-N55K52OO.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-
|
|
20
|
+
} from "./chunk-E6AJPAXM.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -612,6 +612,17 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
612
612
|
} else if (otelFile !== void 0) {
|
|
613
613
|
logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
|
|
614
614
|
}
|
|
615
|
+
if (typeof obj.pool_workspaces === "boolean") {
|
|
616
|
+
result.pool_workspaces = obj.pool_workspaces;
|
|
617
|
+
} else if (obj.pool_workspaces !== void 0) {
|
|
618
|
+
logWarning(`Invalid execution.pool_workspaces in ${configPath}, expected boolean`);
|
|
619
|
+
}
|
|
620
|
+
const poolSlots = obj.pool_slots;
|
|
621
|
+
if (typeof poolSlots === "number" && Number.isInteger(poolSlots) && poolSlots >= 1 && poolSlots <= 50) {
|
|
622
|
+
result.pool_slots = poolSlots;
|
|
623
|
+
} else if (poolSlots !== void 0) {
|
|
624
|
+
logWarning(`Invalid execution.pool_slots in ${configPath}, expected integer 1-50`);
|
|
625
|
+
}
|
|
615
626
|
return Object.keys(result).length > 0 ? result : void 0;
|
|
616
627
|
}
|
|
617
628
|
function logWarning(message) {
|
|
@@ -2053,6 +2064,7 @@ async function processMessages(options) {
|
|
|
2053
2064
|
repoRootPath,
|
|
2054
2065
|
guidelinePatterns,
|
|
2055
2066
|
guidelinePaths,
|
|
2067
|
+
treatFileSegmentsAsGuidelines,
|
|
2056
2068
|
textParts,
|
|
2057
2069
|
messageType,
|
|
2058
2070
|
verbose
|
|
@@ -2100,16 +2112,20 @@ async function processMessages(options) {
|
|
|
2100
2112
|
}
|
|
2101
2113
|
try {
|
|
2102
2114
|
const fileContent = (await readFile4(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2115
|
+
const classifyAsGuideline = shouldTreatAsGuideline({
|
|
2116
|
+
messageType,
|
|
2117
|
+
resolvedPath,
|
|
2118
|
+
repoRootPath,
|
|
2119
|
+
guidelinePatterns,
|
|
2120
|
+
treatFileSegmentsAsGuidelines
|
|
2121
|
+
});
|
|
2122
|
+
if (classifyAsGuideline && guidelinePaths) {
|
|
2123
|
+
guidelinePaths.push(path5.resolve(resolvedPath));
|
|
2124
|
+
if (verbose) {
|
|
2125
|
+
console.log(` [Guideline] Found: ${displayPath}`);
|
|
2126
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
2112
2127
|
}
|
|
2128
|
+
continue;
|
|
2113
2129
|
}
|
|
2114
2130
|
segments.push({
|
|
2115
2131
|
type: "file",
|
|
@@ -2138,6 +2154,26 @@ async function processMessages(options) {
|
|
|
2138
2154
|
}
|
|
2139
2155
|
return segments;
|
|
2140
2156
|
}
|
|
2157
|
+
function shouldTreatAsGuideline(options) {
|
|
2158
|
+
const {
|
|
2159
|
+
messageType,
|
|
2160
|
+
resolvedPath,
|
|
2161
|
+
repoRootPath,
|
|
2162
|
+
guidelinePatterns,
|
|
2163
|
+
treatFileSegmentsAsGuidelines
|
|
2164
|
+
} = options;
|
|
2165
|
+
if (messageType !== "input") {
|
|
2166
|
+
return false;
|
|
2167
|
+
}
|
|
2168
|
+
if (treatFileSegmentsAsGuidelines) {
|
|
2169
|
+
return true;
|
|
2170
|
+
}
|
|
2171
|
+
if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
2172
|
+
return false;
|
|
2173
|
+
}
|
|
2174
|
+
const relativeToRepo = path5.relative(repoRootPath, resolvedPath);
|
|
2175
|
+
return isGuidelineFile(relativeToRepo, guidelinePatterns);
|
|
2176
|
+
}
|
|
2141
2177
|
function asString3(value) {
|
|
2142
2178
|
return typeof value === "string" ? value : void 0;
|
|
2143
2179
|
}
|
|
@@ -2476,6 +2512,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2476
2512
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
2477
2513
|
console.log(` - ${guidelinePath}`);
|
|
2478
2514
|
}
|
|
2515
|
+
} else if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
2516
|
+
console.log(" No guidelines found (guideline_patterns not configured)");
|
|
2479
2517
|
} else {
|
|
2480
2518
|
console.log(" No guidelines found");
|
|
2481
2519
|
}
|
|
@@ -2845,7 +2883,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2845
2883
|
} else {
|
|
2846
2884
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
2847
2885
|
}
|
|
2848
|
-
const suiteWorkspace =
|
|
2886
|
+
const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
|
|
2849
2887
|
const suiteInputMessages = expandInputShorthand(suite.input);
|
|
2850
2888
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
2851
2889
|
const _globalTarget = asString6(rawGlobalExecution?.target) ?? asString6(suite.target);
|
|
@@ -2881,12 +2919,24 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2881
2919
|
}
|
|
2882
2920
|
const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
|
|
2883
2921
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
2884
|
-
const
|
|
2922
|
+
const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
|
|
2923
|
+
const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
|
|
2885
2924
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
2886
2925
|
const guidelinePaths = [];
|
|
2887
2926
|
const inputTextParts = [];
|
|
2888
|
-
const
|
|
2889
|
-
messages:
|
|
2927
|
+
const suiteInputSegments = effectiveSuiteInputMessages ? await processMessages({
|
|
2928
|
+
messages: effectiveSuiteInputMessages,
|
|
2929
|
+
searchRoots,
|
|
2930
|
+
repoRootPath,
|
|
2931
|
+
guidelinePatterns,
|
|
2932
|
+
guidelinePaths,
|
|
2933
|
+
treatFileSegmentsAsGuidelines: true,
|
|
2934
|
+
textParts: inputTextParts,
|
|
2935
|
+
messageType: "input",
|
|
2936
|
+
verbose
|
|
2937
|
+
}) : [];
|
|
2938
|
+
const testInputSegments = await processMessages({
|
|
2939
|
+
messages: testInputMessages,
|
|
2890
2940
|
searchRoots,
|
|
2891
2941
|
repoRootPath,
|
|
2892
2942
|
guidelinePatterns,
|
|
@@ -2895,6 +2945,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2895
2945
|
messageType: "input",
|
|
2896
2946
|
verbose
|
|
2897
2947
|
});
|
|
2948
|
+
const inputSegments = [...suiteInputSegments, ...testInputSegments];
|
|
2898
2949
|
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
2899
2950
|
messages: expectedMessages,
|
|
2900
2951
|
searchRoots,
|
|
@@ -2942,7 +2993,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2942
2993
|
...guidelinePaths.map((guidelinePath) => path8.resolve(guidelinePath)),
|
|
2943
2994
|
...userFilePaths
|
|
2944
2995
|
];
|
|
2945
|
-
const caseWorkspace =
|
|
2996
|
+
const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
|
|
2946
2997
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
2947
2998
|
const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
|
|
2948
2999
|
const caseTargets = extractTargetsFromTestCase(evalcase);
|
|
@@ -2973,6 +3024,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2973
3024
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
2974
3025
|
console.log(` - ${guidelinePath}`);
|
|
2975
3026
|
}
|
|
3027
|
+
} else if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
3028
|
+
console.log(" No guidelines found (guideline_patterns not configured)");
|
|
2976
3029
|
} else {
|
|
2977
3030
|
console.log(" No guidelines found");
|
|
2978
3031
|
}
|
|
@@ -3061,16 +3114,57 @@ function parseRepoConfig(raw) {
|
|
|
3061
3114
|
...clone !== void 0 && { clone }
|
|
3062
3115
|
};
|
|
3063
3116
|
}
|
|
3064
|
-
function
|
|
3117
|
+
function parseWorkspaceHookConfig(raw, evalFileDir) {
|
|
3065
3118
|
if (!isJsonObject(raw)) return void 0;
|
|
3119
|
+
const script = parseWorkspaceScriptConfig(raw, evalFileDir);
|
|
3066
3120
|
const obj = raw;
|
|
3067
|
-
const
|
|
3068
|
-
const
|
|
3069
|
-
if (!
|
|
3121
|
+
const reset = obj.reset === "none" || obj.reset === "fast" || obj.reset === "strict" ? obj.reset : void 0;
|
|
3122
|
+
const clean = obj.clean === "always" || obj.clean === "on_success" || obj.clean === "on_failure" || obj.clean === "never" ? obj.clean : void 0;
|
|
3123
|
+
if (!script && !reset && !clean) return void 0;
|
|
3070
3124
|
return {
|
|
3071
|
-
...
|
|
3072
|
-
...
|
|
3125
|
+
...script ?? {},
|
|
3126
|
+
...reset !== void 0 && { reset },
|
|
3127
|
+
...clean !== void 0 && { clean }
|
|
3128
|
+
};
|
|
3129
|
+
}
|
|
3130
|
+
function parseWorkspaceHooksConfig(raw, evalFileDir) {
|
|
3131
|
+
if (!isJsonObject(raw)) return void 0;
|
|
3132
|
+
const obj = raw;
|
|
3133
|
+
const beforeAllTests = parseWorkspaceHookConfig(obj.before_all_tests, evalFileDir);
|
|
3134
|
+
const beforeEachTest = parseWorkspaceHookConfig(obj.before_each_test, evalFileDir);
|
|
3135
|
+
const afterEachTest = parseWorkspaceHookConfig(obj.after_each_test, evalFileDir);
|
|
3136
|
+
const afterAllTests = parseWorkspaceHookConfig(obj.after_all_tests, evalFileDir);
|
|
3137
|
+
const onReuse = parseWorkspaceHookConfig(obj.on_reuse, evalFileDir);
|
|
3138
|
+
const onFinish = parseWorkspaceHookConfig(obj.on_finish, evalFileDir);
|
|
3139
|
+
const hooks = {
|
|
3140
|
+
...beforeAllTests !== void 0 && { before_all_tests: beforeAllTests },
|
|
3141
|
+
...beforeEachTest !== void 0 && { before_each_test: beforeEachTest },
|
|
3142
|
+
...afterEachTest !== void 0 && { after_each_test: afterEachTest },
|
|
3143
|
+
...afterAllTests !== void 0 && { after_all_tests: afterAllTests },
|
|
3144
|
+
...onReuse !== void 0 && { on_reuse: onReuse },
|
|
3145
|
+
...onFinish !== void 0 && { on_finish: onFinish }
|
|
3073
3146
|
};
|
|
3147
|
+
return Object.keys(hooks).length > 0 ? hooks : void 0;
|
|
3148
|
+
}
|
|
3149
|
+
async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
3150
|
+
if (typeof raw === "string") {
|
|
3151
|
+
const workspaceFilePath = path8.resolve(evalFileDir, raw);
|
|
3152
|
+
let content;
|
|
3153
|
+
try {
|
|
3154
|
+
content = await readFile7(workspaceFilePath, "utf8");
|
|
3155
|
+
} catch {
|
|
3156
|
+
throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
|
|
3157
|
+
}
|
|
3158
|
+
const parsed = parse2(content);
|
|
3159
|
+
if (!isJsonObject(parsed)) {
|
|
3160
|
+
throw new Error(
|
|
3161
|
+
`Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
|
|
3162
|
+
);
|
|
3163
|
+
}
|
|
3164
|
+
const workspaceFileDir = path8.dirname(workspaceFilePath);
|
|
3165
|
+
return parseWorkspaceConfig(parsed, workspaceFileDir);
|
|
3166
|
+
}
|
|
3167
|
+
return parseWorkspaceConfig(raw, evalFileDir);
|
|
3074
3168
|
}
|
|
3075
3169
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
3076
3170
|
if (!isJsonObject(raw)) return void 0;
|
|
@@ -3081,37 +3175,56 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
3081
3175
|
}
|
|
3082
3176
|
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
3083
3177
|
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
3084
|
-
const
|
|
3085
|
-
const
|
|
3086
|
-
const
|
|
3087
|
-
const
|
|
3088
|
-
|
|
3089
|
-
if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
3178
|
+
const hooks = parseWorkspaceHooksConfig(obj.hooks, evalFileDir);
|
|
3179
|
+
const mode = obj.mode === "pooled" || obj.mode === "ephemeral" || obj.mode === "static" ? obj.mode : void 0;
|
|
3180
|
+
const staticPath = typeof obj.static_path === "string" ? obj.static_path : void 0;
|
|
3181
|
+
const pool = typeof obj.pool === "boolean" ? obj.pool : void 0;
|
|
3182
|
+
if (!template && !isolation && !repos && !hooks && !mode && !staticPath && pool === void 0)
|
|
3090
3183
|
return void 0;
|
|
3091
3184
|
return {
|
|
3092
3185
|
...template !== void 0 && { template },
|
|
3093
3186
|
...isolation !== void 0 && { isolation },
|
|
3094
3187
|
...repos !== void 0 && { repos },
|
|
3095
|
-
...
|
|
3096
|
-
...
|
|
3097
|
-
...
|
|
3098
|
-
...
|
|
3099
|
-
...afterEach !== void 0 && { after_each: afterEach }
|
|
3188
|
+
...hooks !== void 0 && { hooks },
|
|
3189
|
+
...mode !== void 0 && { mode },
|
|
3190
|
+
...staticPath !== void 0 && { static_path: staticPath },
|
|
3191
|
+
...pool !== void 0 && { pool }
|
|
3100
3192
|
};
|
|
3101
3193
|
}
|
|
3102
3194
|
function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
3103
3195
|
if (!suiteLevel && !caseLevel) return void 0;
|
|
3104
3196
|
if (!suiteLevel) return caseLevel;
|
|
3105
3197
|
if (!caseLevel) return suiteLevel;
|
|
3198
|
+
const mergeHook = (suiteHook, caseHook) => {
|
|
3199
|
+
if (!suiteHook && !caseHook) return void 0;
|
|
3200
|
+
return {
|
|
3201
|
+
...suiteHook ?? {},
|
|
3202
|
+
...caseHook ?? {}
|
|
3203
|
+
};
|
|
3204
|
+
};
|
|
3205
|
+
const mergedHooks = {
|
|
3206
|
+
before_all_tests: mergeHook(
|
|
3207
|
+
suiteLevel.hooks?.before_all_tests,
|
|
3208
|
+
caseLevel.hooks?.before_all_tests
|
|
3209
|
+
),
|
|
3210
|
+
before_each_test: mergeHook(
|
|
3211
|
+
suiteLevel.hooks?.before_each_test,
|
|
3212
|
+
caseLevel.hooks?.before_each_test
|
|
3213
|
+
),
|
|
3214
|
+
after_each_test: mergeHook(suiteLevel.hooks?.after_each_test, caseLevel.hooks?.after_each_test),
|
|
3215
|
+
after_all_tests: mergeHook(suiteLevel.hooks?.after_all_tests, caseLevel.hooks?.after_all_tests),
|
|
3216
|
+
on_reuse: mergeHook(suiteLevel.hooks?.on_reuse, caseLevel.hooks?.on_reuse),
|
|
3217
|
+
on_finish: mergeHook(suiteLevel.hooks?.on_finish, caseLevel.hooks?.on_finish)
|
|
3218
|
+
};
|
|
3219
|
+
const hasHooks = Object.values(mergedHooks).some((hook) => hook !== void 0);
|
|
3106
3220
|
return {
|
|
3107
3221
|
template: caseLevel.template ?? suiteLevel.template,
|
|
3108
3222
|
isolation: caseLevel.isolation ?? suiteLevel.isolation,
|
|
3109
3223
|
repos: caseLevel.repos ?? suiteLevel.repos,
|
|
3110
|
-
|
|
3111
|
-
|
|
3112
|
-
|
|
3113
|
-
|
|
3114
|
-
after_each: caseLevel.after_each ?? suiteLevel.after_each
|
|
3224
|
+
...hasHooks && { hooks: mergedHooks },
|
|
3225
|
+
mode: caseLevel.mode ?? suiteLevel.mode,
|
|
3226
|
+
static_path: caseLevel.static_path ?? suiteLevel.static_path,
|
|
3227
|
+
pool: caseLevel.pool ?? suiteLevel.pool
|
|
3115
3228
|
};
|
|
3116
3229
|
}
|
|
3117
3230
|
function asString6(value) {
|
|
@@ -7165,15 +7278,15 @@ function getAgentvHome() {
|
|
|
7165
7278
|
function getWorkspacesRoot() {
|
|
7166
7279
|
return path21.join(getAgentvHome(), "workspaces");
|
|
7167
7280
|
}
|
|
7168
|
-
function getGitCacheRoot() {
|
|
7169
|
-
return path21.join(getAgentvHome(), "git-cache");
|
|
7170
|
-
}
|
|
7171
7281
|
function getSubagentsRoot() {
|
|
7172
7282
|
return path21.join(getAgentvHome(), "subagents");
|
|
7173
7283
|
}
|
|
7174
7284
|
function getTraceStateRoot() {
|
|
7175
7285
|
return path21.join(getAgentvHome(), "trace-state");
|
|
7176
7286
|
}
|
|
7287
|
+
function getWorkspacePoolRoot() {
|
|
7288
|
+
return path21.join(getAgentvHome(), "workspace-pool");
|
|
7289
|
+
}
|
|
7177
7290
|
|
|
7178
7291
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
7179
7292
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
@@ -7996,8 +8109,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
7996
8109
|
|
|
7997
8110
|
**IMPORTANT**: Follow these exact steps:
|
|
7998
8111
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
7999
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
8000
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
8001
8112
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
8002
8113
|
\`\`\`
|
|
8003
8114
|
Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
|
|
@@ -8014,8 +8125,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
8014
8125
|
|
|
8015
8126
|
**IMPORTANT**: Follow these exact steps:
|
|
8016
8127
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
8017
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
8018
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
8019
8128
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
8020
8129
|
3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
|
|
8021
8130
|
`;
|
|
@@ -8628,15 +8737,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
8628
8737
|
});
|
|
8629
8738
|
}
|
|
8630
8739
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
8631
|
-
const { mkdir: mkdir14, readFile:
|
|
8740
|
+
const { mkdir: mkdir14, readFile: readFile13, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
8632
8741
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
8633
|
-
const
|
|
8742
|
+
const path42 = await import("node:path");
|
|
8634
8743
|
const { randomUUID: randomUUID8 } = await import("node:crypto");
|
|
8635
|
-
const dir =
|
|
8744
|
+
const dir = path42.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
8636
8745
|
await mkdir14(dir, { recursive: true });
|
|
8637
|
-
const stdinPath =
|
|
8638
|
-
const stdoutPath =
|
|
8639
|
-
const stderrPath =
|
|
8746
|
+
const stdinPath = path42.join(dir, "stdin.txt");
|
|
8747
|
+
const stdoutPath = path42.join(dir, "stdout.txt");
|
|
8748
|
+
const stderrPath = path42.join(dir, "stderr.txt");
|
|
8640
8749
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
8641
8750
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
8642
8751
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
@@ -8666,8 +8775,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
8666
8775
|
resolve(code ?? 0);
|
|
8667
8776
|
});
|
|
8668
8777
|
});
|
|
8669
|
-
const stdout = (await
|
|
8670
|
-
const stderr = (await
|
|
8778
|
+
const stdout = (await readFile13(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
8779
|
+
const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
8671
8780
|
return { stdout, stderr, exitCode };
|
|
8672
8781
|
} finally {
|
|
8673
8782
|
await rm6(dir, { recursive: true, force: true });
|
|
@@ -8988,7 +9097,7 @@ var CodeEvaluator = class {
|
|
|
8988
9097
|
outputPath,
|
|
8989
9098
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
8990
9099
|
inputFiles: context.evalCase.file_paths.filter(
|
|
8991
|
-
(
|
|
9100
|
+
(path42) => !context.evalCase.guideline_paths.includes(path42)
|
|
8992
9101
|
),
|
|
8993
9102
|
input: context.evalCase.input,
|
|
8994
9103
|
trace: context.trace ?? null,
|
|
@@ -9238,6 +9347,8 @@ ${context.fileChanges}`;
|
|
|
9238
9347
|
};
|
|
9239
9348
|
} catch (e) {
|
|
9240
9349
|
const message = e instanceof Error ? e.message : String(e);
|
|
9350
|
+
const evalName = context.evaluator?.name ?? "llm-judge";
|
|
9351
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
9241
9352
|
return {
|
|
9242
9353
|
score: 0,
|
|
9243
9354
|
verdict: "skip",
|
|
@@ -9266,24 +9377,39 @@ ${context.fileChanges}`;
|
|
|
9266
9377
|
systemPrompt,
|
|
9267
9378
|
target: judgeProvider.targetName
|
|
9268
9379
|
};
|
|
9269
|
-
|
|
9270
|
-
|
|
9271
|
-
|
|
9272
|
-
|
|
9273
|
-
|
|
9274
|
-
|
|
9275
|
-
|
|
9276
|
-
|
|
9277
|
-
|
|
9278
|
-
|
|
9279
|
-
|
|
9280
|
-
|
|
9281
|
-
|
|
9282
|
-
|
|
9283
|
-
|
|
9284
|
-
|
|
9285
|
-
|
|
9286
|
-
|
|
9380
|
+
try {
|
|
9381
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
9382
|
+
context,
|
|
9383
|
+
judgeProvider,
|
|
9384
|
+
systemPrompt,
|
|
9385
|
+
userPrompt: prompt,
|
|
9386
|
+
schema: rubricEvaluationSchema
|
|
9387
|
+
});
|
|
9388
|
+
const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
|
|
9389
|
+
return {
|
|
9390
|
+
score,
|
|
9391
|
+
verdict,
|
|
9392
|
+
hits,
|
|
9393
|
+
misses,
|
|
9394
|
+
expectedAspectCount: rubrics.length,
|
|
9395
|
+
reasoning: data.overall_reasoning,
|
|
9396
|
+
evaluatorRawRequest,
|
|
9397
|
+
tokenUsage
|
|
9398
|
+
};
|
|
9399
|
+
} catch (e) {
|
|
9400
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
9401
|
+
const evalName = context.evaluator?.name ?? "llm-judge";
|
|
9402
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
9403
|
+
return {
|
|
9404
|
+
score: 0,
|
|
9405
|
+
verdict: "skip",
|
|
9406
|
+
hits: [],
|
|
9407
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
9408
|
+
expectedAspectCount: rubrics.length,
|
|
9409
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
9410
|
+
evaluatorRawRequest
|
|
9411
|
+
};
|
|
9412
|
+
}
|
|
9287
9413
|
}
|
|
9288
9414
|
/**
|
|
9289
9415
|
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
@@ -9297,25 +9423,40 @@ ${context.fileChanges}`;
|
|
|
9297
9423
|
systemPrompt,
|
|
9298
9424
|
target: judgeProvider.targetName
|
|
9299
9425
|
};
|
|
9300
|
-
|
|
9301
|
-
|
|
9302
|
-
|
|
9303
|
-
|
|
9304
|
-
|
|
9305
|
-
|
|
9306
|
-
|
|
9307
|
-
|
|
9308
|
-
|
|
9309
|
-
|
|
9310
|
-
|
|
9311
|
-
|
|
9312
|
-
|
|
9313
|
-
|
|
9314
|
-
|
|
9315
|
-
|
|
9316
|
-
|
|
9317
|
-
|
|
9318
|
-
|
|
9426
|
+
try {
|
|
9427
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
9428
|
+
context,
|
|
9429
|
+
judgeProvider,
|
|
9430
|
+
systemPrompt,
|
|
9431
|
+
userPrompt: prompt,
|
|
9432
|
+
schema: scoreRangeEvaluationSchema
|
|
9433
|
+
});
|
|
9434
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
9435
|
+
return {
|
|
9436
|
+
score,
|
|
9437
|
+
verdict,
|
|
9438
|
+
hits,
|
|
9439
|
+
misses,
|
|
9440
|
+
expectedAspectCount: rubrics.length,
|
|
9441
|
+
reasoning: data.overall_reasoning,
|
|
9442
|
+
evaluatorRawRequest,
|
|
9443
|
+
details,
|
|
9444
|
+
tokenUsage
|
|
9445
|
+
};
|
|
9446
|
+
} catch (e) {
|
|
9447
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
9448
|
+
const evalName = context.evaluator?.name ?? "llm-judge";
|
|
9449
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
9450
|
+
return {
|
|
9451
|
+
score: 0,
|
|
9452
|
+
verdict: "skip",
|
|
9453
|
+
hits: [],
|
|
9454
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
9455
|
+
expectedAspectCount: rubrics.length,
|
|
9456
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
9457
|
+
evaluatorRawRequest
|
|
9458
|
+
};
|
|
9459
|
+
}
|
|
9319
9460
|
}
|
|
9320
9461
|
/**
|
|
9321
9462
|
* Build prompt for score-range rubric evaluation.
|
|
@@ -9601,19 +9742,13 @@ var CompositeEvaluator = class {
|
|
|
9601
9742
|
runWeightedAverage(results, weights) {
|
|
9602
9743
|
let totalWeight = 0;
|
|
9603
9744
|
let weightedSum = 0;
|
|
9745
|
+
let evaluatedCount = 0;
|
|
9604
9746
|
const allHits = [];
|
|
9605
9747
|
const allMisses = [];
|
|
9606
9748
|
const reasoningParts = [];
|
|
9607
9749
|
const scores = [];
|
|
9608
9750
|
for (const member of results) {
|
|
9609
9751
|
const weight = weights?.[member.id] ?? 1;
|
|
9610
|
-
totalWeight += weight;
|
|
9611
|
-
weightedSum += member.result.score * weight;
|
|
9612
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
9613
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
9614
|
-
if (member.result.reasoning) {
|
|
9615
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
9616
|
-
}
|
|
9617
9752
|
scores.push({
|
|
9618
9753
|
name: member.id,
|
|
9619
9754
|
type: member.type,
|
|
@@ -9628,6 +9763,32 @@ var CompositeEvaluator = class {
|
|
|
9628
9763
|
details: member.result.details,
|
|
9629
9764
|
tokenUsage: member.result.tokenUsage
|
|
9630
9765
|
});
|
|
9766
|
+
if (member.result.verdict === "skip") {
|
|
9767
|
+
continue;
|
|
9768
|
+
}
|
|
9769
|
+
evaluatedCount++;
|
|
9770
|
+
totalWeight += weight;
|
|
9771
|
+
weightedSum += member.result.score * weight;
|
|
9772
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
9773
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
9774
|
+
if (member.result.reasoning) {
|
|
9775
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
9776
|
+
}
|
|
9777
|
+
}
|
|
9778
|
+
if (evaluatedCount === 0 && results.length > 0) {
|
|
9779
|
+
return {
|
|
9780
|
+
score: 0,
|
|
9781
|
+
verdict: "skip",
|
|
9782
|
+
hits: [],
|
|
9783
|
+
misses: [],
|
|
9784
|
+
expectedAspectCount: 1,
|
|
9785
|
+
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
9786
|
+
evaluatorRawRequest: {
|
|
9787
|
+
aggregator: "weighted_average",
|
|
9788
|
+
...weights ? { weights } : {}
|
|
9789
|
+
},
|
|
9790
|
+
scores
|
|
9791
|
+
};
|
|
9631
9792
|
}
|
|
9632
9793
|
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
9633
9794
|
return {
|
|
@@ -9651,19 +9812,8 @@ var CompositeEvaluator = class {
|
|
|
9651
9812
|
const reasoningParts = [];
|
|
9652
9813
|
let passingCount = 0;
|
|
9653
9814
|
let borderlineCount = 0;
|
|
9815
|
+
let evaluatedCount = 0;
|
|
9654
9816
|
for (const member of results) {
|
|
9655
|
-
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
9656
|
-
if (isPassing) {
|
|
9657
|
-
passingCount++;
|
|
9658
|
-
if (member.result.verdict === "borderline") {
|
|
9659
|
-
borderlineCount++;
|
|
9660
|
-
}
|
|
9661
|
-
}
|
|
9662
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
9663
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
9664
|
-
if (member.result.reasoning) {
|
|
9665
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
9666
|
-
}
|
|
9667
9817
|
scores.push({
|
|
9668
9818
|
name: member.id,
|
|
9669
9819
|
type: member.type,
|
|
@@ -9677,8 +9827,39 @@ var CompositeEvaluator = class {
|
|
|
9677
9827
|
details: member.result.details,
|
|
9678
9828
|
tokenUsage: member.result.tokenUsage
|
|
9679
9829
|
});
|
|
9830
|
+
if (member.result.verdict === "skip") {
|
|
9831
|
+
continue;
|
|
9832
|
+
}
|
|
9833
|
+
evaluatedCount++;
|
|
9834
|
+
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
9835
|
+
if (isPassing) {
|
|
9836
|
+
passingCount++;
|
|
9837
|
+
if (member.result.verdict === "borderline") {
|
|
9838
|
+
borderlineCount++;
|
|
9839
|
+
}
|
|
9840
|
+
}
|
|
9841
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
9842
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
9843
|
+
if (member.result.reasoning) {
|
|
9844
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
9845
|
+
}
|
|
9846
|
+
}
|
|
9847
|
+
if (evaluatedCount === 0 && results.length > 0) {
|
|
9848
|
+
return {
|
|
9849
|
+
score: 0,
|
|
9850
|
+
verdict: "skip",
|
|
9851
|
+
hits: [],
|
|
9852
|
+
misses: [],
|
|
9853
|
+
expectedAspectCount: 1,
|
|
9854
|
+
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
9855
|
+
evaluatorRawRequest: {
|
|
9856
|
+
aggregator: "threshold",
|
|
9857
|
+
threshold
|
|
9858
|
+
},
|
|
9859
|
+
scores
|
|
9860
|
+
};
|
|
9680
9861
|
}
|
|
9681
|
-
const totalCount =
|
|
9862
|
+
const totalCount = evaluatedCount;
|
|
9682
9863
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
9683
9864
|
const pass = score >= threshold;
|
|
9684
9865
|
if (pass && borderlineCount > 0) {
|
|
@@ -10186,115 +10367,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
10186
10367
|
* Evaluate a single field against the expected value.
|
|
10187
10368
|
*/
|
|
10188
10369
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
10189
|
-
const { path:
|
|
10190
|
-
const candidateValue = resolvePath(candidateData,
|
|
10191
|
-
const expectedValue = resolvePath(expectedData,
|
|
10370
|
+
const { path: path42, match, required = true, weight = 1 } = fieldConfig;
|
|
10371
|
+
const candidateValue = resolvePath(candidateData, path42);
|
|
10372
|
+
const expectedValue = resolvePath(expectedData, path42);
|
|
10192
10373
|
if (expectedValue === void 0) {
|
|
10193
10374
|
return {
|
|
10194
|
-
path:
|
|
10375
|
+
path: path42,
|
|
10195
10376
|
score: 1,
|
|
10196
10377
|
// No expected value means no comparison needed
|
|
10197
10378
|
weight,
|
|
10198
10379
|
hit: true,
|
|
10199
|
-
message: `${
|
|
10380
|
+
message: `${path42}: no expected value`
|
|
10200
10381
|
};
|
|
10201
10382
|
}
|
|
10202
10383
|
if (candidateValue === void 0) {
|
|
10203
10384
|
if (required) {
|
|
10204
10385
|
return {
|
|
10205
|
-
path:
|
|
10386
|
+
path: path42,
|
|
10206
10387
|
score: 0,
|
|
10207
10388
|
weight,
|
|
10208
10389
|
hit: false,
|
|
10209
|
-
message: `${
|
|
10390
|
+
message: `${path42} (required, missing)`
|
|
10210
10391
|
};
|
|
10211
10392
|
}
|
|
10212
10393
|
return {
|
|
10213
|
-
path:
|
|
10394
|
+
path: path42,
|
|
10214
10395
|
score: 1,
|
|
10215
10396
|
// Don't penalize missing optional fields
|
|
10216
10397
|
weight: 0,
|
|
10217
10398
|
// Zero weight means it won't affect the score
|
|
10218
10399
|
hit: true,
|
|
10219
|
-
message: `${
|
|
10400
|
+
message: `${path42}: optional field missing`
|
|
10220
10401
|
};
|
|
10221
10402
|
}
|
|
10222
10403
|
switch (match) {
|
|
10223
10404
|
case "exact":
|
|
10224
|
-
return this.compareExact(
|
|
10405
|
+
return this.compareExact(path42, candidateValue, expectedValue, weight);
|
|
10225
10406
|
case "numeric_tolerance":
|
|
10226
10407
|
return this.compareNumericTolerance(
|
|
10227
|
-
|
|
10408
|
+
path42,
|
|
10228
10409
|
candidateValue,
|
|
10229
10410
|
expectedValue,
|
|
10230
10411
|
fieldConfig,
|
|
10231
10412
|
weight
|
|
10232
10413
|
);
|
|
10233
10414
|
case "date":
|
|
10234
|
-
return this.compareDate(
|
|
10415
|
+
return this.compareDate(path42, candidateValue, expectedValue, fieldConfig, weight);
|
|
10235
10416
|
default:
|
|
10236
10417
|
return {
|
|
10237
|
-
path:
|
|
10418
|
+
path: path42,
|
|
10238
10419
|
score: 0,
|
|
10239
10420
|
weight,
|
|
10240
10421
|
hit: false,
|
|
10241
|
-
message: `${
|
|
10422
|
+
message: `${path42}: unknown match type "${match}"`
|
|
10242
10423
|
};
|
|
10243
10424
|
}
|
|
10244
10425
|
}
|
|
10245
10426
|
/**
|
|
10246
10427
|
* Exact equality comparison.
|
|
10247
10428
|
*/
|
|
10248
|
-
compareExact(
|
|
10429
|
+
compareExact(path42, candidateValue, expectedValue, weight) {
|
|
10249
10430
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
10250
10431
|
return {
|
|
10251
|
-
path:
|
|
10432
|
+
path: path42,
|
|
10252
10433
|
score: 1,
|
|
10253
10434
|
weight,
|
|
10254
10435
|
hit: true,
|
|
10255
|
-
message:
|
|
10436
|
+
message: path42
|
|
10256
10437
|
};
|
|
10257
10438
|
}
|
|
10258
10439
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
10259
10440
|
return {
|
|
10260
|
-
path:
|
|
10441
|
+
path: path42,
|
|
10261
10442
|
score: 0,
|
|
10262
10443
|
weight,
|
|
10263
10444
|
hit: false,
|
|
10264
|
-
message: `${
|
|
10445
|
+
message: `${path42} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
10265
10446
|
};
|
|
10266
10447
|
}
|
|
10267
10448
|
return {
|
|
10268
|
-
path:
|
|
10449
|
+
path: path42,
|
|
10269
10450
|
score: 0,
|
|
10270
10451
|
weight,
|
|
10271
10452
|
hit: false,
|
|
10272
|
-
message: `${
|
|
10453
|
+
message: `${path42} (value mismatch)`
|
|
10273
10454
|
};
|
|
10274
10455
|
}
|
|
10275
10456
|
/**
|
|
10276
10457
|
* Numeric comparison with absolute or relative tolerance.
|
|
10277
10458
|
*/
|
|
10278
|
-
compareNumericTolerance(
|
|
10459
|
+
compareNumericTolerance(path42, candidateValue, expectedValue, fieldConfig, weight) {
|
|
10279
10460
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
10280
10461
|
const candidateNum = toNumber2(candidateValue);
|
|
10281
10462
|
const expectedNum = toNumber2(expectedValue);
|
|
10282
10463
|
if (candidateNum === null || expectedNum === null) {
|
|
10283
10464
|
return {
|
|
10284
|
-
path:
|
|
10465
|
+
path: path42,
|
|
10285
10466
|
score: 0,
|
|
10286
10467
|
weight,
|
|
10287
10468
|
hit: false,
|
|
10288
|
-
message: `${
|
|
10469
|
+
message: `${path42} (non-numeric value)`
|
|
10289
10470
|
};
|
|
10290
10471
|
}
|
|
10291
10472
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
10292
10473
|
return {
|
|
10293
|
-
path:
|
|
10474
|
+
path: path42,
|
|
10294
10475
|
score: 0,
|
|
10295
10476
|
weight,
|
|
10296
10477
|
hit: false,
|
|
10297
|
-
message: `${
|
|
10478
|
+
message: `${path42} (invalid numeric value)`
|
|
10298
10479
|
};
|
|
10299
10480
|
}
|
|
10300
10481
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -10307,61 +10488,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
10307
10488
|
}
|
|
10308
10489
|
if (withinTolerance) {
|
|
10309
10490
|
return {
|
|
10310
|
-
path:
|
|
10491
|
+
path: path42,
|
|
10311
10492
|
score: 1,
|
|
10312
10493
|
weight,
|
|
10313
10494
|
hit: true,
|
|
10314
|
-
message: `${
|
|
10495
|
+
message: `${path42} (within tolerance: diff=${diff.toFixed(2)})`
|
|
10315
10496
|
};
|
|
10316
10497
|
}
|
|
10317
10498
|
return {
|
|
10318
|
-
path:
|
|
10499
|
+
path: path42,
|
|
10319
10500
|
score: 0,
|
|
10320
10501
|
weight,
|
|
10321
10502
|
hit: false,
|
|
10322
|
-
message: `${
|
|
10503
|
+
message: `${path42} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
10323
10504
|
};
|
|
10324
10505
|
}
|
|
10325
10506
|
/**
|
|
10326
10507
|
* Date comparison with format normalization.
|
|
10327
10508
|
*/
|
|
10328
|
-
compareDate(
|
|
10509
|
+
compareDate(path42, candidateValue, expectedValue, fieldConfig, weight) {
|
|
10329
10510
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
10330
10511
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
10331
10512
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
10332
10513
|
if (candidateDate === null) {
|
|
10333
10514
|
return {
|
|
10334
|
-
path:
|
|
10515
|
+
path: path42,
|
|
10335
10516
|
score: 0,
|
|
10336
10517
|
weight,
|
|
10337
10518
|
hit: false,
|
|
10338
|
-
message: `${
|
|
10519
|
+
message: `${path42} (unparseable candidate date)`
|
|
10339
10520
|
};
|
|
10340
10521
|
}
|
|
10341
10522
|
if (expectedDate === null) {
|
|
10342
10523
|
return {
|
|
10343
|
-
path:
|
|
10524
|
+
path: path42,
|
|
10344
10525
|
score: 0,
|
|
10345
10526
|
weight,
|
|
10346
10527
|
hit: false,
|
|
10347
|
-
message: `${
|
|
10528
|
+
message: `${path42} (unparseable expected date)`
|
|
10348
10529
|
};
|
|
10349
10530
|
}
|
|
10350
10531
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
10351
10532
|
return {
|
|
10352
|
-
path:
|
|
10533
|
+
path: path42,
|
|
10353
10534
|
score: 1,
|
|
10354
10535
|
weight,
|
|
10355
10536
|
hit: true,
|
|
10356
|
-
message:
|
|
10537
|
+
message: path42
|
|
10357
10538
|
};
|
|
10358
10539
|
}
|
|
10359
10540
|
return {
|
|
10360
|
-
path:
|
|
10541
|
+
path: path42,
|
|
10361
10542
|
score: 0,
|
|
10362
10543
|
weight,
|
|
10363
10544
|
hit: false,
|
|
10364
|
-
message: `${
|
|
10545
|
+
message: `${path42} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
10365
10546
|
};
|
|
10366
10547
|
}
|
|
10367
10548
|
/**
|
|
@@ -10402,11 +10583,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
10402
10583
|
};
|
|
10403
10584
|
}
|
|
10404
10585
|
};
|
|
10405
|
-
function resolvePath(obj,
|
|
10406
|
-
if (!
|
|
10586
|
+
function resolvePath(obj, path42) {
|
|
10587
|
+
if (!path42 || !obj) {
|
|
10407
10588
|
return void 0;
|
|
10408
10589
|
}
|
|
10409
|
-
const parts =
|
|
10590
|
+
const parts = path42.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
10410
10591
|
let current = obj;
|
|
10411
10592
|
for (const part of parts) {
|
|
10412
10593
|
if (current === null || current === void 0) {
|
|
@@ -11224,8 +11405,8 @@ var TokenUsageEvaluator = class {
|
|
|
11224
11405
|
};
|
|
11225
11406
|
|
|
11226
11407
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
11227
|
-
function getNestedValue(obj,
|
|
11228
|
-
const parts =
|
|
11408
|
+
function getNestedValue(obj, path42) {
|
|
11409
|
+
const parts = path42.split(".");
|
|
11229
11410
|
let current = obj;
|
|
11230
11411
|
for (const part of parts) {
|
|
11231
11412
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -11788,7 +11969,7 @@ function runEqualsAssertion(output, value) {
|
|
|
11788
11969
|
// src/evaluation/orchestrator.ts
|
|
11789
11970
|
import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
|
|
11790
11971
|
import { mkdir as mkdir12, stat as stat7 } from "node:fs/promises";
|
|
11791
|
-
import
|
|
11972
|
+
import path39 from "node:path";
|
|
11792
11973
|
import micromatch4 from "micromatch";
|
|
11793
11974
|
|
|
11794
11975
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -12658,16 +12839,14 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
12658
12839
|
}
|
|
12659
12840
|
}
|
|
12660
12841
|
|
|
12661
|
-
// src/evaluation/workspace/
|
|
12842
|
+
// src/evaluation/workspace/pool-manager.ts
|
|
12662
12843
|
import { execFile } from "node:child_process";
|
|
12663
12844
|
import { createHash } from "node:crypto";
|
|
12664
12845
|
import { existsSync as existsSync2 } from "node:fs";
|
|
12665
|
-
import { mkdir as mkdir11, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
|
|
12846
|
+
import { cp as cp2, mkdir as mkdir11, readFile as readFile11, readdir as readdir4, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
|
|
12666
12847
|
import path36 from "node:path";
|
|
12667
12848
|
import { promisify as promisify5 } from "node:util";
|
|
12668
12849
|
var execFileAsync = promisify5(execFile);
|
|
12669
|
-
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
12670
|
-
var LOCK_TIMEOUT_MS = 6e4;
|
|
12671
12850
|
function gitEnv() {
|
|
12672
12851
|
const env = { ...process.env };
|
|
12673
12852
|
for (const key of Object.keys(env)) {
|
|
@@ -12682,160 +12861,326 @@ function gitEnv() {
|
|
|
12682
12861
|
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
12683
12862
|
};
|
|
12684
12863
|
}
|
|
12685
|
-
function cacheKey(source) {
|
|
12686
|
-
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
12687
|
-
return createHash("sha256").update(raw).digest("hex");
|
|
12688
|
-
}
|
|
12689
|
-
function getSourceUrl(source) {
|
|
12690
|
-
return source.type === "git" ? source.url : source.path;
|
|
12691
|
-
}
|
|
12692
12864
|
async function git(args, opts) {
|
|
12693
12865
|
const { stdout } = await execFileAsync("git", args, {
|
|
12694
12866
|
cwd: opts?.cwd,
|
|
12695
|
-
timeout: opts?.timeout ??
|
|
12867
|
+
timeout: opts?.timeout ?? 3e5,
|
|
12696
12868
|
env: gitEnv(),
|
|
12697
12869
|
maxBuffer: 50 * 1024 * 1024
|
|
12698
|
-
// 50MB
|
|
12699
12870
|
});
|
|
12700
12871
|
return stdout.trim();
|
|
12701
12872
|
}
|
|
12702
|
-
|
|
12703
|
-
const
|
|
12704
|
-
|
|
12705
|
-
|
|
12706
|
-
|
|
12707
|
-
|
|
12708
|
-
|
|
12709
|
-
|
|
12710
|
-
|
|
12873
|
+
function normalizeRepoForFingerprint(repo) {
|
|
12874
|
+
const source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
|
|
12875
|
+
const result = {
|
|
12876
|
+
path: repo.path,
|
|
12877
|
+
source,
|
|
12878
|
+
ref: repo.checkout?.ref ?? "HEAD"
|
|
12879
|
+
};
|
|
12880
|
+
if (repo.clone?.depth !== void 0) {
|
|
12881
|
+
result.depth = repo.clone.depth;
|
|
12882
|
+
}
|
|
12883
|
+
if (repo.clone?.filter !== void 0) {
|
|
12884
|
+
result.filter = repo.clone.filter;
|
|
12885
|
+
}
|
|
12886
|
+
if (repo.clone?.sparse?.length) {
|
|
12887
|
+
result.sparse = [...repo.clone.sparse].sort();
|
|
12888
|
+
}
|
|
12889
|
+
return result;
|
|
12890
|
+
}
|
|
12891
|
+
function computeWorkspaceFingerprint(templatePath, repos) {
|
|
12892
|
+
const canonical = {
|
|
12893
|
+
templatePath: templatePath ?? null,
|
|
12894
|
+
repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint)
|
|
12895
|
+
};
|
|
12896
|
+
return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
|
|
12897
|
+
}
|
|
12898
|
+
async function copyDirectoryRecursive2(src, dest, skipDirs) {
|
|
12899
|
+
await mkdir11(dest, { recursive: true });
|
|
12900
|
+
const entries = await readdir4(src, { withFileTypes: true });
|
|
12901
|
+
for (const entry of entries) {
|
|
12902
|
+
const srcPath = path36.join(src, entry.name);
|
|
12903
|
+
const destPath = path36.join(dest, entry.name);
|
|
12904
|
+
if (entry.name === ".git") {
|
|
12905
|
+
continue;
|
|
12906
|
+
}
|
|
12907
|
+
if (entry.isDirectory()) {
|
|
12908
|
+
if (skipDirs?.has(entry.name)) {
|
|
12711
12909
|
continue;
|
|
12712
12910
|
}
|
|
12713
|
-
|
|
12911
|
+
await copyDirectoryRecursive2(srcPath, destPath, skipDirs);
|
|
12912
|
+
} else {
|
|
12913
|
+
await cp2(srcPath, destPath, { preserveTimestamps: true, force: true });
|
|
12714
12914
|
}
|
|
12715
12915
|
}
|
|
12716
|
-
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
12717
12916
|
}
|
|
12718
|
-
|
|
12719
|
-
|
|
12720
|
-
|
|
12721
|
-
|
|
12722
|
-
}
|
|
12723
|
-
}
|
|
12724
|
-
var RepoManager = class {
|
|
12725
|
-
cacheDir;
|
|
12726
|
-
verbose;
|
|
12727
|
-
constructor(cacheDir, verbose = false) {
|
|
12728
|
-
this.cacheDir = cacheDir ?? getGitCacheRoot();
|
|
12729
|
-
this.verbose = verbose;
|
|
12917
|
+
var WorkspacePoolManager = class {
|
|
12918
|
+
poolRoot;
|
|
12919
|
+
constructor(poolRoot) {
|
|
12920
|
+
this.poolRoot = poolRoot ?? getWorkspacePoolRoot();
|
|
12730
12921
|
}
|
|
12731
|
-
|
|
12732
|
-
|
|
12733
|
-
|
|
12734
|
-
|
|
12735
|
-
|
|
12922
|
+
/**
|
|
12923
|
+
* Acquire a workspace slot from the pool.
|
|
12924
|
+
*
|
|
12925
|
+
* 1. Compute fingerprint from template + repos
|
|
12926
|
+
* 2. Check drift (compare stored metadata.json fingerprint vs computed)
|
|
12927
|
+
* 3. If drift: warn, remove all slots, rematerialize
|
|
12928
|
+
* 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
|
|
12929
|
+
* 5. If slot exists: reset repos, re-copy template files (skip repo directories)
|
|
12930
|
+
* 6. If new slot: copy template, materialize all repos, write metadata.json
|
|
12931
|
+
* 7. Return the slot (with path, index, isExisting)
|
|
12932
|
+
*/
|
|
12933
|
+
async acquireWorkspace(options) {
|
|
12934
|
+
const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
|
|
12935
|
+
const fingerprint = computeWorkspaceFingerprint(templatePath, repos);
|
|
12936
|
+
const poolDir = path36.join(this.poolRoot, fingerprint);
|
|
12937
|
+
await mkdir11(poolDir, { recursive: true });
|
|
12938
|
+
const drifted = await this.checkDrift(poolDir, fingerprint);
|
|
12939
|
+
if (drifted) {
|
|
12940
|
+
console.warn(
|
|
12941
|
+
`[workspace-pool] Drift detected for fingerprint ${fingerprint.slice(0, 12)}... Removing stale slots.`
|
|
12736
12942
|
);
|
|
12943
|
+
await this.removeAllSlots(poolDir);
|
|
12737
12944
|
}
|
|
12738
|
-
|
|
12739
|
-
const
|
|
12740
|
-
|
|
12741
|
-
|
|
12742
|
-
|
|
12743
|
-
|
|
12945
|
+
for (let i = 0; i < maxSlots; i++) {
|
|
12946
|
+
const slotPath = path36.join(poolDir, `slot-${i}`);
|
|
12947
|
+
const lockPath = `${slotPath}.lock`;
|
|
12948
|
+
const locked = await this.tryLock(lockPath);
|
|
12949
|
+
if (!locked) {
|
|
12950
|
+
continue;
|
|
12744
12951
|
}
|
|
12745
|
-
|
|
12746
|
-
|
|
12747
|
-
|
|
12748
|
-
|
|
12749
|
-
|
|
12750
|
-
|
|
12751
|
-
|
|
12952
|
+
const slotExists = existsSync2(slotPath);
|
|
12953
|
+
if (slotExists) {
|
|
12954
|
+
await this.resetSlot(slotPath, templatePath, repos, poolReset);
|
|
12955
|
+
return {
|
|
12956
|
+
index: i,
|
|
12957
|
+
path: slotPath,
|
|
12958
|
+
isExisting: true,
|
|
12959
|
+
lockPath,
|
|
12960
|
+
fingerprint,
|
|
12961
|
+
poolDir
|
|
12962
|
+
};
|
|
12752
12963
|
}
|
|
12753
|
-
|
|
12964
|
+
await mkdir11(slotPath, { recursive: true });
|
|
12965
|
+
if (templatePath) {
|
|
12966
|
+
await copyDirectoryRecursive2(templatePath, slotPath);
|
|
12967
|
+
}
|
|
12968
|
+
if (repos.length > 0) {
|
|
12969
|
+
await repoManager.materializeAll(repos, slotPath);
|
|
12970
|
+
}
|
|
12971
|
+
await this.writeMetadata(poolDir, fingerprint, templatePath ?? null, repos);
|
|
12972
|
+
return {
|
|
12973
|
+
index: i,
|
|
12974
|
+
path: slotPath,
|
|
12975
|
+
isExisting: false,
|
|
12976
|
+
lockPath,
|
|
12977
|
+
fingerprint,
|
|
12978
|
+
poolDir
|
|
12979
|
+
};
|
|
12980
|
+
}
|
|
12981
|
+
throw new Error(
|
|
12982
|
+
`All ${maxSlots} pool slots are locked for fingerprint ${fingerprint.slice(0, 12)}...`
|
|
12983
|
+
);
|
|
12984
|
+
}
|
|
12985
|
+
/** Remove lock file to release a slot. */
|
|
12986
|
+
async releaseSlot(slot) {
|
|
12987
|
+
try {
|
|
12988
|
+
await unlink(slot.lockPath);
|
|
12989
|
+
} catch {
|
|
12754
12990
|
}
|
|
12755
12991
|
}
|
|
12756
12992
|
/**
|
|
12757
|
-
*
|
|
12758
|
-
*
|
|
12759
|
-
* Returns
|
|
12993
|
+
* Try to acquire a PID-based lock file.
|
|
12994
|
+
* On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
|
|
12995
|
+
* Returns true if lock acquired, false if slot is actively locked.
|
|
12996
|
+
* Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
|
|
12760
12997
|
*/
|
|
12761
|
-
async
|
|
12762
|
-
|
|
12763
|
-
|
|
12764
|
-
|
|
12765
|
-
|
|
12766
|
-
|
|
12767
|
-
|
|
12768
|
-
|
|
12769
|
-
|
|
12998
|
+
async tryLock(lockPath) {
|
|
12999
|
+
for (let attempt = 0; attempt < 3; attempt++) {
|
|
13000
|
+
try {
|
|
13001
|
+
await writeFile7(lockPath, String(process.pid), { flag: "wx" });
|
|
13002
|
+
return true;
|
|
13003
|
+
} catch (err) {
|
|
13004
|
+
if (err.code !== "EEXIST") {
|
|
13005
|
+
throw err;
|
|
13006
|
+
}
|
|
13007
|
+
try {
|
|
13008
|
+
const pidStr = await readFile11(lockPath, "utf-8");
|
|
13009
|
+
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
13010
|
+
if (!Number.isNaN(pid)) {
|
|
13011
|
+
try {
|
|
13012
|
+
process.kill(pid, 0);
|
|
13013
|
+
return false;
|
|
13014
|
+
} catch {
|
|
13015
|
+
await unlink(lockPath).catch(() => {
|
|
13016
|
+
});
|
|
13017
|
+
continue;
|
|
13018
|
+
}
|
|
13019
|
+
}
|
|
13020
|
+
} catch {
|
|
13021
|
+
}
|
|
13022
|
+
return false;
|
|
13023
|
+
}
|
|
12770
13024
|
}
|
|
12771
|
-
|
|
12772
|
-
|
|
12773
|
-
|
|
12774
|
-
|
|
13025
|
+
return false;
|
|
13026
|
+
}
|
|
13027
|
+
/**
|
|
13028
|
+
* Check if the stored fingerprint in metadata.json differs from the computed one.
|
|
13029
|
+
* Returns true if drifted, false otherwise.
|
|
13030
|
+
* Returns false (no drift) if metadata.json doesn't exist (first use).
|
|
13031
|
+
*/
|
|
13032
|
+
async checkDrift(poolDir, fingerprint) {
|
|
13033
|
+
const metadataPath = path36.join(poolDir, "metadata.json");
|
|
13034
|
+
try {
|
|
13035
|
+
const raw = await readFile11(metadataPath, "utf-8");
|
|
13036
|
+
const metadata = JSON.parse(raw);
|
|
13037
|
+
return metadata.fingerprint !== fingerprint;
|
|
13038
|
+
} catch {
|
|
13039
|
+
return false;
|
|
13040
|
+
}
|
|
13041
|
+
}
|
|
13042
|
+
/** Write metadata.json with fingerprint, inputs, and timestamp. */
|
|
13043
|
+
async writeMetadata(poolDir, fingerprint, templatePath, repos) {
|
|
13044
|
+
const metadata = {
|
|
13045
|
+
fingerprint,
|
|
13046
|
+
templatePath,
|
|
13047
|
+
repos,
|
|
13048
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
13049
|
+
};
|
|
13050
|
+
await writeFile7(path36.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
|
|
13051
|
+
}
|
|
13052
|
+
/** Remove all slot directories and their lock files from a pool directory. */
|
|
13053
|
+
async removeAllSlots(poolDir) {
|
|
13054
|
+
const entries = await readdir4(poolDir);
|
|
13055
|
+
for (const entry of entries) {
|
|
13056
|
+
if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
|
|
13057
|
+
const lockPath = path36.join(poolDir, `${entry}.lock`);
|
|
13058
|
+
if (existsSync2(lockPath)) {
|
|
13059
|
+
try {
|
|
13060
|
+
const pidStr = await readFile11(lockPath, "utf-8");
|
|
13061
|
+
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
13062
|
+
if (!Number.isNaN(pid)) {
|
|
13063
|
+
try {
|
|
13064
|
+
process.kill(pid, 0);
|
|
13065
|
+
console.warn(`[workspace-pool] Skipping slot ${entry}: locked by PID ${pid}`);
|
|
13066
|
+
continue;
|
|
13067
|
+
} catch {
|
|
13068
|
+
}
|
|
13069
|
+
}
|
|
13070
|
+
} catch {
|
|
13071
|
+
}
|
|
12775
13072
|
}
|
|
12776
|
-
|
|
13073
|
+
await rm5(path36.join(poolDir, entry), { recursive: true, force: true });
|
|
13074
|
+
await rm5(lockPath, { force: true }).catch(() => {
|
|
13075
|
+
});
|
|
12777
13076
|
}
|
|
12778
|
-
|
|
12779
|
-
|
|
12780
|
-
|
|
13077
|
+
}
|
|
13078
|
+
await rm5(path36.join(poolDir, "metadata.json"), { force: true }).catch(() => {
|
|
13079
|
+
});
|
|
13080
|
+
}
|
|
13081
|
+
/**
|
|
13082
|
+
* Reset an existing slot for reuse:
|
|
13083
|
+
* 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
|
|
13084
|
+
* 2. Re-copy template files (skip repo directories)
|
|
13085
|
+
*/
|
|
13086
|
+
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
13087
|
+
for (const repo of repos) {
|
|
13088
|
+
const repoDir = path36.join(slotPath, repo.path);
|
|
13089
|
+
if (!existsSync2(repoDir)) {
|
|
13090
|
+
continue;
|
|
13091
|
+
}
|
|
13092
|
+
if (poolReset === "none") {
|
|
13093
|
+
continue;
|
|
13094
|
+
}
|
|
13095
|
+
const ref = repo.checkout?.ref ?? "HEAD";
|
|
13096
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
13097
|
+
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
13098
|
+
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
13099
|
+
}
|
|
13100
|
+
if (templatePath) {
|
|
13101
|
+
const repoDirNames = new Set(
|
|
13102
|
+
repos.map((r) => {
|
|
13103
|
+
const normalized = r.path.replace(/^\.\//, "");
|
|
13104
|
+
return normalized.split("/")[0];
|
|
13105
|
+
})
|
|
12781
13106
|
);
|
|
13107
|
+
await copyDirectoryRecursive2(templatePath, slotPath, repoDirNames);
|
|
12782
13108
|
}
|
|
12783
|
-
|
|
12784
|
-
|
|
12785
|
-
|
|
13109
|
+
}
|
|
13110
|
+
};
|
|
13111
|
+
|
|
13112
|
+
// src/evaluation/workspace/repo-manager.ts
|
|
13113
|
+
import { execFile as execFile2 } from "node:child_process";
|
|
13114
|
+
import path37 from "node:path";
|
|
13115
|
+
import { promisify as promisify6 } from "node:util";
|
|
13116
|
+
var execFileAsync2 = promisify6(execFile2);
|
|
13117
|
+
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
13118
|
+
function gitEnv2() {
|
|
13119
|
+
const env = { ...process.env };
|
|
13120
|
+
for (const key of Object.keys(env)) {
|
|
13121
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
13122
|
+
delete env[key];
|
|
13123
|
+
}
|
|
13124
|
+
}
|
|
13125
|
+
return {
|
|
13126
|
+
...env,
|
|
13127
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
13128
|
+
GIT_ASKPASS: "",
|
|
13129
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
13130
|
+
};
|
|
13131
|
+
}
|
|
13132
|
+
function getSourceUrl(source) {
|
|
13133
|
+
return source.type === "git" ? source.url : source.path;
|
|
13134
|
+
}
|
|
13135
|
+
async function git2(args, opts) {
|
|
13136
|
+
const { stdout } = await execFileAsync2("git", args, {
|
|
13137
|
+
cwd: opts?.cwd,
|
|
13138
|
+
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
13139
|
+
env: gitEnv2(),
|
|
13140
|
+
maxBuffer: 50 * 1024 * 1024
|
|
13141
|
+
// 50MB
|
|
13142
|
+
});
|
|
13143
|
+
return stdout.trim();
|
|
13144
|
+
}
|
|
13145
|
+
var RepoManager = class {
|
|
13146
|
+
verbose;
|
|
13147
|
+
constructor(verbose = false) {
|
|
13148
|
+
this.verbose = verbose;
|
|
13149
|
+
}
|
|
13150
|
+
async runGit(args, opts) {
|
|
13151
|
+
const startedAt = Date.now();
|
|
12786
13152
|
if (this.verbose) {
|
|
12787
|
-
console.log(
|
|
12788
|
-
`[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`
|
|
12789
|
-
);
|
|
13153
|
+
console.log(`[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`);
|
|
12790
13154
|
}
|
|
12791
13155
|
try {
|
|
12792
|
-
|
|
12793
|
-
|
|
12794
|
-
|
|
12795
|
-
}
|
|
12796
|
-
const fetchArgs = ["fetch", "--prune"];
|
|
12797
|
-
if (depth) {
|
|
12798
|
-
fetchArgs.push("--depth", String(depth));
|
|
12799
|
-
}
|
|
12800
|
-
await this.runGit(fetchArgs, { cwd: cachePath });
|
|
12801
|
-
} else {
|
|
12802
|
-
if (this.verbose) {
|
|
12803
|
-
console.log(`[repo] creating new cache ${cachePath}`);
|
|
12804
|
-
}
|
|
12805
|
-
const cloneArgs = ["clone", "--mirror", "--bare"];
|
|
12806
|
-
if (depth) {
|
|
12807
|
-
cloneArgs.push("--depth", String(depth));
|
|
12808
|
-
}
|
|
12809
|
-
const sourceUrl = getSourceUrl(source);
|
|
12810
|
-
const cloneUrl = depth && source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
|
|
12811
|
-
cloneArgs.push(cloneUrl, cachePath);
|
|
12812
|
-
await this.runGit(cloneArgs);
|
|
13156
|
+
const output = await git2(args, opts);
|
|
13157
|
+
if (this.verbose) {
|
|
13158
|
+
console.log(`[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`);
|
|
12813
13159
|
}
|
|
12814
|
-
|
|
12815
|
-
|
|
13160
|
+
return output;
|
|
13161
|
+
} catch (error) {
|
|
12816
13162
|
if (this.verbose) {
|
|
12817
|
-
|
|
13163
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13164
|
+
console.log(
|
|
13165
|
+
`[repo] git fail durationMs=${Date.now() - startedAt} args=${args.join(" ")} error=${message}`
|
|
13166
|
+
);
|
|
12818
13167
|
}
|
|
13168
|
+
throw error;
|
|
12819
13169
|
}
|
|
12820
|
-
return cachePath;
|
|
12821
13170
|
}
|
|
12822
13171
|
/**
|
|
12823
|
-
* Clone a repo from
|
|
13172
|
+
* Clone a repo directly from source into the workspace at the configured path.
|
|
12824
13173
|
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
12825
13174
|
*/
|
|
12826
13175
|
async materialize(repo, workspacePath) {
|
|
12827
|
-
const targetDir =
|
|
13176
|
+
const targetDir = path37.join(workspacePath, repo.path);
|
|
13177
|
+
const sourceUrl = getSourceUrl(repo.source);
|
|
12828
13178
|
const startedAt = Date.now();
|
|
12829
13179
|
if (this.verbose) {
|
|
12830
13180
|
console.log(
|
|
12831
|
-
`[repo] materialize start path=${repo.path} source=${
|
|
13181
|
+
`[repo] materialize start path=${repo.path} source=${sourceUrl} workspace=${workspacePath}`
|
|
12832
13182
|
);
|
|
12833
13183
|
}
|
|
12834
|
-
const cachePath = await this.ensureCache(
|
|
12835
|
-
repo.source,
|
|
12836
|
-
repo.clone?.depth,
|
|
12837
|
-
repo.checkout?.resolve
|
|
12838
|
-
);
|
|
12839
13184
|
const cloneArgs = ["clone"];
|
|
12840
13185
|
if (repo.clone?.depth) {
|
|
12841
13186
|
cloneArgs.push("--depth", String(repo.clone.depth));
|
|
@@ -12844,7 +13189,7 @@ var RepoManager = class {
|
|
|
12844
13189
|
cloneArgs.push("--filter", repo.clone.filter);
|
|
12845
13190
|
}
|
|
12846
13191
|
cloneArgs.push("--no-checkout");
|
|
12847
|
-
const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${
|
|
13192
|
+
const cloneUrl = (repo.clone?.depth || repo.clone?.filter) && repo.source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
|
|
12848
13193
|
cloneArgs.push(cloneUrl, targetDir);
|
|
12849
13194
|
await this.runGit(cloneArgs);
|
|
12850
13195
|
if (repo.clone?.sparse?.length) {
|
|
@@ -12916,85 +13261,47 @@ var RepoManager = class {
|
|
|
12916
13261
|
}
|
|
12917
13262
|
}
|
|
12918
13263
|
/** Reset repos in workspace to their checkout state. */
|
|
12919
|
-
async reset(repos, workspacePath,
|
|
12920
|
-
|
|
12921
|
-
for (const repo of repos) {
|
|
12922
|
-
const targetDir = path36.join(workspacePath, repo.path);
|
|
12923
|
-
await rm5(targetDir, { recursive: true, force: true });
|
|
12924
|
-
}
|
|
12925
|
-
await this.materializeAll(repos, workspacePath);
|
|
12926
|
-
return;
|
|
12927
|
-
}
|
|
13264
|
+
async reset(repos, workspacePath, reset) {
|
|
13265
|
+
const cleanFlag = reset === "strict" ? "-fdx" : "-fd";
|
|
12928
13266
|
for (const repo of repos) {
|
|
12929
|
-
const targetDir =
|
|
13267
|
+
const targetDir = path37.join(workspacePath, repo.path);
|
|
12930
13268
|
await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
|
|
12931
|
-
await this.runGit(["clean",
|
|
13269
|
+
await this.runGit(["clean", cleanFlag], { cwd: targetDir });
|
|
12932
13270
|
}
|
|
12933
13271
|
}
|
|
12934
|
-
/**
|
|
12935
|
-
* Seed the cache from a local repository, setting the remote to a given URL.
|
|
12936
|
-
* Useful for avoiding slow network clones when a local clone already exists.
|
|
12937
|
-
*/
|
|
12938
|
-
async seedCache(localPath, remoteUrl, opts) {
|
|
12939
|
-
const source = { type: "git", url: remoteUrl };
|
|
12940
|
-
const key = cacheKey(source);
|
|
12941
|
-
const cachePath = path36.join(this.cacheDir, key);
|
|
12942
|
-
const lockPath = `${cachePath}.lock`;
|
|
12943
|
-
await mkdir11(this.cacheDir, { recursive: true });
|
|
12944
|
-
await acquireLock(lockPath);
|
|
12945
|
-
try {
|
|
12946
|
-
if (existsSync2(path36.join(cachePath, "HEAD"))) {
|
|
12947
|
-
if (!opts?.force) {
|
|
12948
|
-
throw new Error(
|
|
12949
|
-
`Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
|
|
12950
|
-
);
|
|
12951
|
-
}
|
|
12952
|
-
await rm5(cachePath, { recursive: true, force: true });
|
|
12953
|
-
}
|
|
12954
|
-
await git(["clone", "--mirror", "--bare", localPath, cachePath]);
|
|
12955
|
-
await git(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
|
|
12956
|
-
} finally {
|
|
12957
|
-
await releaseLock(lockPath);
|
|
12958
|
-
}
|
|
12959
|
-
return cachePath;
|
|
12960
|
-
}
|
|
12961
|
-
/** Remove the entire cache directory. */
|
|
12962
|
-
async cleanCache() {
|
|
12963
|
-
await rm5(this.cacheDir, { recursive: true, force: true });
|
|
12964
|
-
}
|
|
12965
13272
|
};
|
|
12966
13273
|
|
|
12967
13274
|
// src/evaluation/workspace/resolve.ts
|
|
12968
|
-
import { readdir as
|
|
12969
|
-
import
|
|
13275
|
+
import { readdir as readdir5, stat as stat6 } from "node:fs/promises";
|
|
13276
|
+
import path38 from "node:path";
|
|
12970
13277
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
12971
13278
|
if (!templatePath) {
|
|
12972
13279
|
return void 0;
|
|
12973
13280
|
}
|
|
12974
|
-
const resolved =
|
|
13281
|
+
const resolved = path38.resolve(templatePath);
|
|
12975
13282
|
const stats = await stat6(resolved);
|
|
12976
13283
|
if (stats.isFile()) {
|
|
12977
13284
|
return {
|
|
12978
|
-
dir:
|
|
13285
|
+
dir: path38.dirname(resolved),
|
|
12979
13286
|
workspaceFile: resolved
|
|
12980
13287
|
};
|
|
12981
13288
|
}
|
|
12982
13289
|
if (!stats.isDirectory()) {
|
|
12983
13290
|
throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
|
|
12984
13291
|
}
|
|
12985
|
-
const entries = await
|
|
13292
|
+
const entries = await readdir5(resolved);
|
|
12986
13293
|
const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
|
|
12987
13294
|
if (workspaceFiles.length === 1) {
|
|
12988
13295
|
return {
|
|
12989
13296
|
dir: resolved,
|
|
12990
|
-
workspaceFile:
|
|
13297
|
+
workspaceFile: path38.join(resolved, workspaceFiles[0])
|
|
12991
13298
|
};
|
|
12992
13299
|
}
|
|
12993
13300
|
if (workspaceFiles.length > 1) {
|
|
12994
13301
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
12995
13302
|
return {
|
|
12996
13303
|
dir: resolved,
|
|
12997
|
-
workspaceFile: conventionFile ?
|
|
13304
|
+
workspaceFile: conventionFile ? path38.join(resolved, conventionFile) : void 0
|
|
12998
13305
|
};
|
|
12999
13306
|
}
|
|
13000
13307
|
return { dir: resolved };
|
|
@@ -13046,6 +13353,22 @@ function classifyQualityStatus(score) {
|
|
|
13046
13353
|
function usesFileReferencePrompt(provider) {
|
|
13047
13354
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
13048
13355
|
}
|
|
13356
|
+
function toScriptConfig(hook, hookName, context) {
|
|
13357
|
+
const command = hook.command ?? hook.script;
|
|
13358
|
+
if (!command || command.length === 0) {
|
|
13359
|
+
throw new Error(`${hookName} hook in ${context} requires command or script`);
|
|
13360
|
+
}
|
|
13361
|
+
return {
|
|
13362
|
+
command,
|
|
13363
|
+
...hook.timeout_ms !== void 0 && { timeout_ms: hook.timeout_ms },
|
|
13364
|
+
...hook.timeoutMs !== void 0 && { timeoutMs: hook.timeoutMs },
|
|
13365
|
+
...hook.cwd !== void 0 && { cwd: hook.cwd },
|
|
13366
|
+
...hook.script !== void 0 && { script: hook.script }
|
|
13367
|
+
};
|
|
13368
|
+
}
|
|
13369
|
+
function hasHookCommand(hook) {
|
|
13370
|
+
return !!(hook?.command && hook.command.length > 0 || hook?.script && hook.script.length > 0);
|
|
13371
|
+
}
|
|
13049
13372
|
function getWorkspaceTemplate(target) {
|
|
13050
13373
|
const config = target.config;
|
|
13051
13374
|
if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
|
|
@@ -13076,7 +13399,15 @@ async function runEvaluation(options) {
|
|
|
13076
13399
|
trials,
|
|
13077
13400
|
streamCallbacks,
|
|
13078
13401
|
totalBudgetUsd,
|
|
13079
|
-
failOnError
|
|
13402
|
+
failOnError,
|
|
13403
|
+
poolWorkspaces,
|
|
13404
|
+
poolMaxSlots: configPoolMaxSlots,
|
|
13405
|
+
workspace: legacyWorkspacePath,
|
|
13406
|
+
workspaceMode,
|
|
13407
|
+
workspacePath,
|
|
13408
|
+
workspaceClean,
|
|
13409
|
+
retainOnSuccess,
|
|
13410
|
+
retainOnFailure
|
|
13080
13411
|
} = options;
|
|
13081
13412
|
let useCache = options.useCache;
|
|
13082
13413
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -13150,7 +13481,7 @@ async function runEvaluation(options) {
|
|
|
13150
13481
|
];
|
|
13151
13482
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
13152
13483
|
const typeRegistry = createBuiltinRegistry();
|
|
13153
|
-
const discoveryBaseDir = evalFilePath ?
|
|
13484
|
+
const discoveryBaseDir = evalFilePath ? path39.dirname(path39.resolve(evalFilePath)) : process.cwd();
|
|
13154
13485
|
const evalDir = discoveryBaseDir;
|
|
13155
13486
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
13156
13487
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
@@ -13212,13 +13543,29 @@ async function runEvaluation(options) {
|
|
|
13212
13543
|
}
|
|
13213
13544
|
};
|
|
13214
13545
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
13215
|
-
const
|
|
13546
|
+
const configuredMode = suiteWorkspace?.mode ?? workspaceMode;
|
|
13547
|
+
const configuredStaticPath = suiteWorkspace?.static_path ?? workspacePath ?? legacyWorkspacePath;
|
|
13548
|
+
const useStaticWorkspace = configuredMode === "static" || !!configuredStaticPath && !configuredMode;
|
|
13549
|
+
if (useStaticWorkspace && isPerTestIsolation) {
|
|
13550
|
+
throw new Error(
|
|
13551
|
+
"static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default)."
|
|
13552
|
+
);
|
|
13553
|
+
}
|
|
13554
|
+
if (configuredMode === "static" && !configuredStaticPath) {
|
|
13555
|
+
throw new Error("workspace.mode=static requires workspace.static_path or --workspace-path");
|
|
13556
|
+
}
|
|
13557
|
+
const hasSharedWorkspace = !!(useStaticWorkspace || workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
13558
|
+
const poolEnabled = configuredMode === "pooled" ? true : configuredMode === "ephemeral" || useStaticWorkspace ? false : suiteWorkspace?.pool ?? poolWorkspaces ?? true;
|
|
13559
|
+
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
13560
|
+
const finishCleanPolicy = suiteWorkspace?.hooks?.on_finish?.clean;
|
|
13561
|
+
const resolvedRetainOnSuccess = (finishCleanPolicy === "always" || finishCleanPolicy === "on_success" ? "cleanup" : finishCleanPolicy === "on_failure" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
|
|
13562
|
+
const resolvedRetainOnFailure = (finishCleanPolicy === "always" || finishCleanPolicy === "on_failure" ? "cleanup" : finishCleanPolicy === "on_success" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
13216
13563
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
13217
|
-
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
13564
|
+
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
13218
13565
|
setupLog(
|
|
13219
|
-
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
|
|
13566
|
+
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
|
|
13220
13567
|
);
|
|
13221
|
-
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
13568
|
+
if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
|
|
13222
13569
|
console.warn(
|
|
13223
13570
|
`Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
|
|
13224
13571
|
);
|
|
@@ -13227,7 +13574,38 @@ async function runEvaluation(options) {
|
|
|
13227
13574
|
let sharedWorkspacePath;
|
|
13228
13575
|
let sharedBaselineCommit;
|
|
13229
13576
|
let beforeAllOutput;
|
|
13230
|
-
|
|
13577
|
+
let poolManager;
|
|
13578
|
+
let poolSlot;
|
|
13579
|
+
const poolSlots = [];
|
|
13580
|
+
const availablePoolSlots = [];
|
|
13581
|
+
const poolSlotBaselines = /* @__PURE__ */ new Map();
|
|
13582
|
+
const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
|
|
13583
|
+
if (useStaticWorkspace && configuredStaticPath) {
|
|
13584
|
+
sharedWorkspacePath = configuredStaticPath;
|
|
13585
|
+
setupLog(`using static workspace: ${configuredStaticPath}`);
|
|
13586
|
+
} else if (usePool && suiteWorkspace?.repos) {
|
|
13587
|
+
const slotsNeeded = workers;
|
|
13588
|
+
setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
|
|
13589
|
+
poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
|
|
13590
|
+
const poolRepoManager = new RepoManager(verbose);
|
|
13591
|
+
for (let i = 0; i < slotsNeeded; i++) {
|
|
13592
|
+
const slot = await poolManager.acquireWorkspace({
|
|
13593
|
+
templatePath: workspaceTemplate,
|
|
13594
|
+
repos: suiteWorkspace.repos,
|
|
13595
|
+
maxSlots: poolMaxSlots,
|
|
13596
|
+
repoManager: poolRepoManager,
|
|
13597
|
+
poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? suiteWorkspace.hooks?.on_reuse?.reset ?? "fast"
|
|
13598
|
+
});
|
|
13599
|
+
poolSlots.push(slot);
|
|
13600
|
+
setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
|
|
13601
|
+
}
|
|
13602
|
+
if (slotsNeeded === 1) {
|
|
13603
|
+
poolSlot = poolSlots[0];
|
|
13604
|
+
sharedWorkspacePath = poolSlot.path;
|
|
13605
|
+
} else {
|
|
13606
|
+
availablePoolSlots.push(...poolSlots);
|
|
13607
|
+
}
|
|
13608
|
+
} else if (workspaceTemplate) {
|
|
13231
13609
|
setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
|
|
13232
13610
|
try {
|
|
13233
13611
|
sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
|
|
@@ -13236,288 +13614,359 @@ async function runEvaluation(options) {
|
|
|
13236
13614
|
const message = error instanceof Error ? error.message : String(error);
|
|
13237
13615
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
13238
13616
|
}
|
|
13617
|
+
} else if (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
13618
|
+
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
13619
|
+
await mkdir12(sharedWorkspacePath, { recursive: true });
|
|
13620
|
+
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
13621
|
+
}
|
|
13622
|
+
try {
|
|
13239
13623
|
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
13240
|
-
const copiedWorkspaceFile =
|
|
13624
|
+
const copiedWorkspaceFile = path39.join(sharedWorkspacePath, path39.basename(suiteWorkspaceFile));
|
|
13241
13625
|
try {
|
|
13242
13626
|
await stat7(copiedWorkspaceFile);
|
|
13243
13627
|
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
13244
13628
|
} catch {
|
|
13245
13629
|
}
|
|
13246
13630
|
}
|
|
13247
|
-
|
|
13248
|
-
sharedWorkspacePath
|
|
13249
|
-
|
|
13250
|
-
|
|
13251
|
-
|
|
13252
|
-
|
|
13253
|
-
|
|
13254
|
-
|
|
13255
|
-
|
|
13256
|
-
|
|
13257
|
-
|
|
13258
|
-
|
|
13259
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
13260
|
-
if (sharedWorkspacePath) {
|
|
13261
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13262
|
-
});
|
|
13263
|
-
}
|
|
13264
|
-
throw new Error(`Failed to materialize repos: ${message}`);
|
|
13265
|
-
}
|
|
13266
|
-
}
|
|
13267
|
-
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
13268
|
-
const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
|
|
13269
|
-
setupLog(
|
|
13270
|
-
`running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
13271
|
-
);
|
|
13272
|
-
const scriptContext = {
|
|
13273
|
-
workspacePath: sharedWorkspacePath,
|
|
13274
|
-
testId: "__before_all__",
|
|
13275
|
-
evalRunId,
|
|
13276
|
-
evalDir
|
|
13277
|
-
};
|
|
13278
|
-
try {
|
|
13279
|
-
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
13280
|
-
setupLog("shared before_all completed");
|
|
13281
|
-
} catch (error) {
|
|
13282
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
13283
|
-
if (sharedWorkspacePath) {
|
|
13284
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13285
|
-
});
|
|
13286
|
-
}
|
|
13287
|
-
throw new Error(`before_all script failed: ${message}`);
|
|
13288
|
-
}
|
|
13289
|
-
}
|
|
13290
|
-
if (sharedWorkspacePath) {
|
|
13291
|
-
try {
|
|
13292
|
-
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
13293
|
-
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
13294
|
-
} catch {
|
|
13295
|
-
setupLog("shared baseline initialization skipped (non-fatal)");
|
|
13296
|
-
}
|
|
13297
|
-
}
|
|
13298
|
-
let nextWorkerId = 1;
|
|
13299
|
-
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
13300
|
-
let beforeAllOutputAttached = false;
|
|
13301
|
-
let cumulativeBudgetCost = 0;
|
|
13302
|
-
let budgetExhausted = false;
|
|
13303
|
-
let failOnErrorTriggered = false;
|
|
13304
|
-
const promises = filteredEvalCases.map(
|
|
13305
|
-
(evalCase) => limit(async () => {
|
|
13306
|
-
const workerId = nextWorkerId++;
|
|
13307
|
-
workerIdByEvalId.set(evalCase.id, workerId);
|
|
13308
|
-
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
13309
|
-
const budgetResult = {
|
|
13310
|
-
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13311
|
-
testId: evalCase.id,
|
|
13312
|
-
dataset: evalCase.dataset,
|
|
13313
|
-
score: 0,
|
|
13314
|
-
hits: [],
|
|
13315
|
-
misses: [],
|
|
13316
|
-
answer: "",
|
|
13317
|
-
target: target.name,
|
|
13318
|
-
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13319
|
-
budgetExceeded: true,
|
|
13320
|
-
executionStatus: "execution_error",
|
|
13321
|
-
failureStage: "setup",
|
|
13322
|
-
failureReasonCode: "budget_exceeded",
|
|
13323
|
-
executionError: {
|
|
13324
|
-
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13325
|
-
stage: "setup"
|
|
13326
|
-
}
|
|
13327
|
-
};
|
|
13328
|
-
if (onProgress) {
|
|
13329
|
-
await onProgress({
|
|
13330
|
-
workerId,
|
|
13331
|
-
testId: evalCase.id,
|
|
13332
|
-
status: "failed",
|
|
13333
|
-
completedAt: Date.now(),
|
|
13334
|
-
error: budgetResult.error
|
|
13631
|
+
const repoManager = suiteWorkspace?.repos?.length && !usePool && !useStaticWorkspace ? new RepoManager(verbose) : void 0;
|
|
13632
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
13633
|
+
setupLog(
|
|
13634
|
+
`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
|
|
13635
|
+
);
|
|
13636
|
+
try {
|
|
13637
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
13638
|
+
setupLog("shared repo materialization complete");
|
|
13639
|
+
} catch (error) {
|
|
13640
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13641
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
13642
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13335
13643
|
});
|
|
13336
13644
|
}
|
|
13337
|
-
|
|
13338
|
-
await onResult(budgetResult);
|
|
13339
|
-
}
|
|
13340
|
-
return budgetResult;
|
|
13645
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
13341
13646
|
}
|
|
13342
|
-
|
|
13343
|
-
|
|
13344
|
-
|
|
13345
|
-
|
|
13346
|
-
|
|
13347
|
-
|
|
13348
|
-
|
|
13349
|
-
|
|
13350
|
-
|
|
13351
|
-
|
|
13352
|
-
|
|
13353
|
-
|
|
13354
|
-
|
|
13355
|
-
|
|
13356
|
-
|
|
13357
|
-
|
|
13358
|
-
|
|
13359
|
-
|
|
13360
|
-
|
|
13361
|
-
|
|
13362
|
-
|
|
13363
|
-
|
|
13364
|
-
|
|
13365
|
-
|
|
13647
|
+
}
|
|
13648
|
+
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all_tests;
|
|
13649
|
+
if (sharedWorkspacePath && hasHookCommand(suiteBeforeAllHook)) {
|
|
13650
|
+
const beforeAllHook = suiteBeforeAllHook;
|
|
13651
|
+
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
13652
|
+
setupLog(
|
|
13653
|
+
`running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
13654
|
+
);
|
|
13655
|
+
const scriptContext = {
|
|
13656
|
+
workspacePath: sharedWorkspacePath,
|
|
13657
|
+
testId: "__before_all__",
|
|
13658
|
+
evalRunId,
|
|
13659
|
+
evalDir
|
|
13660
|
+
};
|
|
13661
|
+
try {
|
|
13662
|
+
beforeAllOutput = await executeWorkspaceScript(
|
|
13663
|
+
toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
|
|
13664
|
+
scriptContext
|
|
13665
|
+
);
|
|
13666
|
+
setupLog("shared before_all completed");
|
|
13667
|
+
} catch (error) {
|
|
13668
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13669
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
13670
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13366
13671
|
});
|
|
13367
13672
|
}
|
|
13368
|
-
|
|
13369
|
-
await onResult(haltResult);
|
|
13370
|
-
}
|
|
13371
|
-
return haltResult;
|
|
13372
|
-
}
|
|
13373
|
-
if (onProgress) {
|
|
13374
|
-
await onProgress({
|
|
13375
|
-
workerId,
|
|
13376
|
-
testId: evalCase.id,
|
|
13377
|
-
status: "running",
|
|
13378
|
-
startedAt: Date.now()
|
|
13379
|
-
});
|
|
13673
|
+
throw new Error(`before_all script failed: ${message}`);
|
|
13380
13674
|
}
|
|
13381
|
-
|
|
13382
|
-
|
|
13383
|
-
|
|
13384
|
-
|
|
13385
|
-
|
|
13386
|
-
|
|
13387
|
-
|
|
13388
|
-
|
|
13389
|
-
agentTimeoutMs,
|
|
13390
|
-
cache,
|
|
13391
|
-
useCache,
|
|
13392
|
-
now,
|
|
13393
|
-
judgeProvider,
|
|
13394
|
-
targetResolver,
|
|
13395
|
-
availableTargets,
|
|
13675
|
+
}
|
|
13676
|
+
if (availablePoolSlots.length > 0 && hasHookCommand(suiteBeforeAllHook)) {
|
|
13677
|
+
const beforeAllHook = suiteBeforeAllHook;
|
|
13678
|
+
for (const slot of availablePoolSlots) {
|
|
13679
|
+
setupLog(`running before_all on pool slot ${slot.index}`);
|
|
13680
|
+
const scriptContext = {
|
|
13681
|
+
workspacePath: slot.path,
|
|
13682
|
+
testId: "__before_all__",
|
|
13396
13683
|
evalRunId,
|
|
13397
|
-
keepWorkspaces,
|
|
13398
|
-
cleanupWorkspaces,
|
|
13399
|
-
sharedWorkspacePath,
|
|
13400
|
-
sharedBaselineCommit,
|
|
13401
|
-
suiteWorkspaceFile,
|
|
13402
|
-
streamCallbacks,
|
|
13403
|
-
typeRegistry,
|
|
13404
|
-
repoManager,
|
|
13405
13684
|
evalDir
|
|
13406
13685
|
};
|
|
13407
|
-
|
|
13408
|
-
|
|
13409
|
-
|
|
13410
|
-
|
|
13411
|
-
|
|
13412
|
-
|
|
13413
|
-
|
|
13686
|
+
try {
|
|
13687
|
+
const output = await executeWorkspaceScript(
|
|
13688
|
+
toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
|
|
13689
|
+
scriptContext
|
|
13690
|
+
);
|
|
13691
|
+
if (!beforeAllOutput) beforeAllOutput = output;
|
|
13692
|
+
setupLog(`before_all completed on pool slot ${slot.index}`);
|
|
13693
|
+
} catch (error) {
|
|
13694
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13695
|
+
throw new Error(`before_all script failed on pool slot ${slot.index}: ${message}`);
|
|
13696
|
+
}
|
|
13697
|
+
}
|
|
13698
|
+
}
|
|
13699
|
+
if (sharedWorkspacePath) {
|
|
13700
|
+
try {
|
|
13701
|
+
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
13702
|
+
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
13703
|
+
} catch {
|
|
13704
|
+
setupLog("shared baseline initialization skipped (non-fatal)");
|
|
13705
|
+
}
|
|
13706
|
+
}
|
|
13707
|
+
if (availablePoolSlots.length > 0) {
|
|
13708
|
+
for (const slot of availablePoolSlots) {
|
|
13709
|
+
try {
|
|
13710
|
+
const baseline = await initializeBaseline(slot.path);
|
|
13711
|
+
poolSlotBaselines.set(slot.path, baseline);
|
|
13712
|
+
setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
|
|
13713
|
+
} catch {
|
|
13714
|
+
setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`);
|
|
13715
|
+
}
|
|
13716
|
+
}
|
|
13717
|
+
}
|
|
13718
|
+
let nextWorkerId = 1;
|
|
13719
|
+
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
13720
|
+
let beforeAllOutputAttached = false;
|
|
13721
|
+
let cumulativeBudgetCost = 0;
|
|
13722
|
+
let budgetExhausted = false;
|
|
13723
|
+
let failOnErrorTriggered = false;
|
|
13724
|
+
const promises = filteredEvalCases.map(
|
|
13725
|
+
(evalCase) => limit(async () => {
|
|
13726
|
+
const workerId = nextWorkerId++;
|
|
13727
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
13728
|
+
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
13729
|
+
const budgetResult = {
|
|
13730
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13731
|
+
testId: evalCase.id,
|
|
13732
|
+
dataset: evalCase.dataset,
|
|
13733
|
+
score: 0,
|
|
13734
|
+
hits: [],
|
|
13735
|
+
misses: [],
|
|
13736
|
+
answer: "",
|
|
13737
|
+
target: target.name,
|
|
13738
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13739
|
+
budgetExceeded: true,
|
|
13740
|
+
executionStatus: "execution_error",
|
|
13741
|
+
failureStage: "setup",
|
|
13742
|
+
failureReasonCode: "budget_exceeded",
|
|
13743
|
+
executionError: {
|
|
13744
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13745
|
+
stage: "setup"
|
|
13414
13746
|
}
|
|
13415
|
-
}
|
|
13416
|
-
|
|
13747
|
+
};
|
|
13748
|
+
if (onProgress) {
|
|
13749
|
+
await onProgress({
|
|
13750
|
+
workerId,
|
|
13751
|
+
testId: evalCase.id,
|
|
13752
|
+
status: "failed",
|
|
13753
|
+
completedAt: Date.now(),
|
|
13754
|
+
error: budgetResult.error
|
|
13755
|
+
});
|
|
13417
13756
|
}
|
|
13418
|
-
if (
|
|
13419
|
-
|
|
13420
|
-
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
13421
|
-
budgetExhausted = true;
|
|
13422
|
-
}
|
|
13757
|
+
if (onResult) {
|
|
13758
|
+
await onResult(budgetResult);
|
|
13423
13759
|
}
|
|
13760
|
+
return budgetResult;
|
|
13424
13761
|
}
|
|
13425
|
-
if (failOnError === true &&
|
|
13426
|
-
|
|
13427
|
-
|
|
13428
|
-
|
|
13429
|
-
|
|
13430
|
-
|
|
13762
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
13763
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
13764
|
+
const haltResult = {
|
|
13765
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13766
|
+
testId: evalCase.id,
|
|
13767
|
+
dataset: evalCase.dataset,
|
|
13768
|
+
score: 0,
|
|
13769
|
+
hits: [],
|
|
13770
|
+
misses: [],
|
|
13771
|
+
answer: "",
|
|
13772
|
+
target: target.name,
|
|
13773
|
+
error: errorMsg,
|
|
13774
|
+
executionStatus: "execution_error",
|
|
13775
|
+
failureStage: "setup",
|
|
13776
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
13777
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
13778
|
+
};
|
|
13779
|
+
if (onProgress) {
|
|
13780
|
+
await onProgress({
|
|
13781
|
+
workerId,
|
|
13782
|
+
testId: evalCase.id,
|
|
13783
|
+
status: "failed",
|
|
13784
|
+
completedAt: Date.now(),
|
|
13785
|
+
error: haltResult.error
|
|
13786
|
+
});
|
|
13787
|
+
}
|
|
13788
|
+
if (onResult) {
|
|
13789
|
+
await onResult(haltResult);
|
|
13790
|
+
}
|
|
13791
|
+
return haltResult;
|
|
13431
13792
|
}
|
|
13432
13793
|
if (onProgress) {
|
|
13433
13794
|
await onProgress({
|
|
13434
13795
|
workerId,
|
|
13435
13796
|
testId: evalCase.id,
|
|
13436
|
-
status:
|
|
13437
|
-
startedAt:
|
|
13438
|
-
// Not used for completed status
|
|
13439
|
-
completedAt: Date.now(),
|
|
13440
|
-
error: result.error
|
|
13797
|
+
status: "running",
|
|
13798
|
+
startedAt: Date.now()
|
|
13441
13799
|
});
|
|
13442
13800
|
}
|
|
13443
|
-
|
|
13444
|
-
|
|
13801
|
+
const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : void 0;
|
|
13802
|
+
const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath;
|
|
13803
|
+
const testBaselineCommit = testPoolSlot ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit;
|
|
13804
|
+
try {
|
|
13805
|
+
const judgeProvider = await resolveJudgeProvider(target);
|
|
13806
|
+
const runCaseOptions = {
|
|
13807
|
+
evalCase,
|
|
13808
|
+
provider: primaryProvider,
|
|
13809
|
+
target,
|
|
13810
|
+
evaluators: evaluatorRegistry,
|
|
13811
|
+
maxRetries,
|
|
13812
|
+
agentTimeoutMs,
|
|
13813
|
+
cache,
|
|
13814
|
+
useCache,
|
|
13815
|
+
now,
|
|
13816
|
+
judgeProvider,
|
|
13817
|
+
targetResolver,
|
|
13818
|
+
availableTargets,
|
|
13819
|
+
evalRunId,
|
|
13820
|
+
keepWorkspaces,
|
|
13821
|
+
cleanupWorkspaces,
|
|
13822
|
+
retainOnSuccess: resolvedRetainOnSuccess,
|
|
13823
|
+
retainOnFailure: resolvedRetainOnFailure,
|
|
13824
|
+
sharedWorkspacePath: testWorkspacePath,
|
|
13825
|
+
sharedBaselineCommit: testBaselineCommit,
|
|
13826
|
+
suiteWorkspaceFile,
|
|
13827
|
+
streamCallbacks,
|
|
13828
|
+
typeRegistry,
|
|
13829
|
+
repoManager,
|
|
13830
|
+
evalDir
|
|
13831
|
+
};
|
|
13832
|
+
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
13833
|
+
if (totalBudgetUsd !== void 0) {
|
|
13834
|
+
let caseCost;
|
|
13835
|
+
if (result.trials && result.trials.length > 0) {
|
|
13836
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
13837
|
+
if (trialCostSum > 0) {
|
|
13838
|
+
caseCost = trialCostSum;
|
|
13839
|
+
}
|
|
13840
|
+
} else {
|
|
13841
|
+
caseCost = result.costUsd;
|
|
13842
|
+
}
|
|
13843
|
+
if (caseCost !== void 0) {
|
|
13844
|
+
cumulativeBudgetCost += caseCost;
|
|
13845
|
+
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
13846
|
+
budgetExhausted = true;
|
|
13847
|
+
}
|
|
13848
|
+
}
|
|
13849
|
+
}
|
|
13850
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
13851
|
+
failOnErrorTriggered = true;
|
|
13852
|
+
}
|
|
13853
|
+
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
13854
|
+
result = { ...result, beforeAllOutput };
|
|
13855
|
+
beforeAllOutputAttached = true;
|
|
13856
|
+
}
|
|
13857
|
+
if (onProgress) {
|
|
13858
|
+
await onProgress({
|
|
13859
|
+
workerId,
|
|
13860
|
+
testId: evalCase.id,
|
|
13861
|
+
status: result.error ? "failed" : "completed",
|
|
13862
|
+
startedAt: 0,
|
|
13863
|
+
// Not used for completed status
|
|
13864
|
+
completedAt: Date.now(),
|
|
13865
|
+
error: result.error
|
|
13866
|
+
});
|
|
13867
|
+
}
|
|
13868
|
+
if (onResult) {
|
|
13869
|
+
await onResult(result);
|
|
13870
|
+
}
|
|
13871
|
+
return result;
|
|
13872
|
+
} catch (error) {
|
|
13873
|
+
if (onProgress) {
|
|
13874
|
+
await onProgress({
|
|
13875
|
+
workerId,
|
|
13876
|
+
testId: evalCase.id,
|
|
13877
|
+
status: "failed",
|
|
13878
|
+
completedAt: Date.now(),
|
|
13879
|
+
error: error instanceof Error ? error.message : String(error)
|
|
13880
|
+
});
|
|
13881
|
+
}
|
|
13882
|
+
throw error;
|
|
13883
|
+
} finally {
|
|
13884
|
+
if (testPoolSlot) {
|
|
13885
|
+
availablePoolSlots.push(testPoolSlot);
|
|
13886
|
+
}
|
|
13445
13887
|
}
|
|
13446
|
-
|
|
13447
|
-
|
|
13448
|
-
|
|
13449
|
-
|
|
13450
|
-
|
|
13451
|
-
|
|
13452
|
-
|
|
13453
|
-
|
|
13454
|
-
|
|
13455
|
-
|
|
13888
|
+
})
|
|
13889
|
+
);
|
|
13890
|
+
const settled = await Promise.allSettled(promises);
|
|
13891
|
+
const results = [];
|
|
13892
|
+
for (let i = 0; i < settled.length; i++) {
|
|
13893
|
+
const outcome = settled[i];
|
|
13894
|
+
if (outcome.status === "fulfilled") {
|
|
13895
|
+
results.push(outcome.value);
|
|
13896
|
+
} else {
|
|
13897
|
+
const evalCase = filteredEvalCases[i];
|
|
13898
|
+
const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
|
|
13899
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
13900
|
+
const errorResult = buildErrorResult(
|
|
13901
|
+
evalCase,
|
|
13902
|
+
target.name,
|
|
13903
|
+
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
13904
|
+
outcome.reason,
|
|
13905
|
+
promptInputs,
|
|
13906
|
+
primaryProvider,
|
|
13907
|
+
"agent",
|
|
13908
|
+
"provider_error"
|
|
13909
|
+
);
|
|
13910
|
+
results.push(errorResult);
|
|
13911
|
+
if (onResult) {
|
|
13912
|
+
await onResult(errorResult);
|
|
13456
13913
|
}
|
|
13457
|
-
throw error;
|
|
13458
13914
|
}
|
|
13459
|
-
}
|
|
13460
|
-
|
|
13461
|
-
|
|
13462
|
-
|
|
13463
|
-
|
|
13464
|
-
|
|
13465
|
-
|
|
13466
|
-
|
|
13467
|
-
|
|
13468
|
-
|
|
13469
|
-
|
|
13470
|
-
|
|
13471
|
-
|
|
13472
|
-
|
|
13473
|
-
|
|
13474
|
-
|
|
13475
|
-
|
|
13476
|
-
|
|
13477
|
-
|
|
13478
|
-
|
|
13479
|
-
|
|
13480
|
-
|
|
13481
|
-
|
|
13482
|
-
if (onResult) {
|
|
13483
|
-
await onResult(errorResult);
|
|
13915
|
+
}
|
|
13916
|
+
const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
|
|
13917
|
+
const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all_tests;
|
|
13918
|
+
if (afterAllWorkspaces.length > 0 && hasHookCommand(suiteAfterAllHook)) {
|
|
13919
|
+
const afterAllHook = suiteAfterAllHook;
|
|
13920
|
+
for (const wsPath of afterAllWorkspaces) {
|
|
13921
|
+
const scriptContext = {
|
|
13922
|
+
workspacePath: wsPath,
|
|
13923
|
+
testId: "__after_all__",
|
|
13924
|
+
evalRunId,
|
|
13925
|
+
evalDir
|
|
13926
|
+
};
|
|
13927
|
+
try {
|
|
13928
|
+
const afterAllOutput = await executeWorkspaceScript(
|
|
13929
|
+
toScriptConfig(afterAllHook, "after_all_tests", "suite workspace"),
|
|
13930
|
+
scriptContext,
|
|
13931
|
+
"warn"
|
|
13932
|
+
);
|
|
13933
|
+
if (afterAllOutput && results.length > 0 && wsPath === afterAllWorkspaces[0]) {
|
|
13934
|
+
results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
|
|
13935
|
+
}
|
|
13936
|
+
} catch {
|
|
13937
|
+
}
|
|
13484
13938
|
}
|
|
13485
13939
|
}
|
|
13486
|
-
|
|
13487
|
-
|
|
13488
|
-
|
|
13489
|
-
|
|
13490
|
-
|
|
13491
|
-
|
|
13492
|
-
|
|
13493
|
-
|
|
13494
|
-
|
|
13495
|
-
|
|
13496
|
-
suiteWorkspace.after_all,
|
|
13497
|
-
scriptContext,
|
|
13498
|
-
"warn"
|
|
13499
|
-
);
|
|
13500
|
-
if (afterAllOutput && results.length > 0) {
|
|
13501
|
-
results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
|
|
13940
|
+
if (sharedWorkspacePath && !poolSlot && poolSlots.length === 0 && !useStaticWorkspace) {
|
|
13941
|
+
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
13942
|
+
if (hasFailure) {
|
|
13943
|
+
if (resolvedRetainOnFailure === "cleanup") {
|
|
13944
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13945
|
+
});
|
|
13946
|
+
}
|
|
13947
|
+
} else if (resolvedRetainOnSuccess === "cleanup") {
|
|
13948
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13949
|
+
});
|
|
13502
13950
|
}
|
|
13503
|
-
} catch {
|
|
13504
13951
|
}
|
|
13505
|
-
}
|
|
13506
|
-
if (sharedWorkspacePath) {
|
|
13507
|
-
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
13508
13952
|
if (cleanupWorkspaces) {
|
|
13509
|
-
await
|
|
13510
|
-
});
|
|
13511
|
-
} else if (!hasFailure && !keepWorkspaces) {
|
|
13512
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13953
|
+
await cleanupEvalWorkspaces(evalRunId).catch(() => {
|
|
13513
13954
|
});
|
|
13514
13955
|
}
|
|
13956
|
+
return results;
|
|
13957
|
+
} finally {
|
|
13958
|
+
if (poolManager) {
|
|
13959
|
+
if (poolSlot) {
|
|
13960
|
+
await poolManager.releaseSlot(poolSlot);
|
|
13961
|
+
}
|
|
13962
|
+
for (const slot of poolSlots) {
|
|
13963
|
+
if (slot !== poolSlot) {
|
|
13964
|
+
await poolManager.releaseSlot(slot).catch(() => {
|
|
13965
|
+
});
|
|
13966
|
+
}
|
|
13967
|
+
}
|
|
13968
|
+
}
|
|
13515
13969
|
}
|
|
13516
|
-
if (cleanupWorkspaces) {
|
|
13517
|
-
await cleanupEvalWorkspaces(evalRunId).catch(() => {
|
|
13518
|
-
});
|
|
13519
|
-
}
|
|
13520
|
-
return results;
|
|
13521
13970
|
}
|
|
13522
13971
|
async function runBatchEvaluation(options) {
|
|
13523
13972
|
const {
|
|
@@ -13689,6 +14138,8 @@ async function runEvalCase(options) {
|
|
|
13689
14138
|
evalRunId,
|
|
13690
14139
|
keepWorkspaces,
|
|
13691
14140
|
cleanupWorkspaces: forceCleanup,
|
|
14141
|
+
retainOnSuccess,
|
|
14142
|
+
retainOnFailure,
|
|
13692
14143
|
sharedWorkspacePath,
|
|
13693
14144
|
sharedBaselineCommit,
|
|
13694
14145
|
suiteWorkspaceFile,
|
|
@@ -13700,10 +14151,10 @@ async function runEvalCase(options) {
|
|
|
13700
14151
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
13701
14152
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
13702
14153
|
const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
|
|
13703
|
-
const
|
|
14154
|
+
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
13704
14155
|
let cachedResponse;
|
|
13705
|
-
if (
|
|
13706
|
-
cachedResponse = await cache.get(
|
|
14156
|
+
if (cacheKey && cache) {
|
|
14157
|
+
cachedResponse = await cache.get(cacheKey);
|
|
13707
14158
|
}
|
|
13708
14159
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
13709
14160
|
let workspacePath = sharedWorkspacePath;
|
|
@@ -13734,7 +14185,7 @@ async function runEvalCase(options) {
|
|
|
13734
14185
|
);
|
|
13735
14186
|
}
|
|
13736
14187
|
if (caseWorkspaceFile && workspacePath) {
|
|
13737
|
-
const copiedFile =
|
|
14188
|
+
const copiedFile = path39.join(workspacePath, path39.basename(caseWorkspaceFile));
|
|
13738
14189
|
try {
|
|
13739
14190
|
await stat7(copiedFile);
|
|
13740
14191
|
caseWorkspaceFile = copiedFile;
|
|
@@ -13742,12 +14193,12 @@ async function runEvalCase(options) {
|
|
|
13742
14193
|
}
|
|
13743
14194
|
}
|
|
13744
14195
|
}
|
|
13745
|
-
if (!workspacePath && (evalCase.workspace?.
|
|
14196
|
+
if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
13746
14197
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
13747
14198
|
await mkdir12(workspacePath, { recursive: true });
|
|
13748
14199
|
}
|
|
13749
14200
|
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
13750
|
-
const perCaseRepoManager = new RepoManager(
|
|
14201
|
+
const perCaseRepoManager = new RepoManager(setupDebug);
|
|
13751
14202
|
try {
|
|
13752
14203
|
if (setupDebug) {
|
|
13753
14204
|
console.log(
|
|
@@ -13772,11 +14223,13 @@ async function runEvalCase(options) {
|
|
|
13772
14223
|
);
|
|
13773
14224
|
}
|
|
13774
14225
|
}
|
|
13775
|
-
|
|
13776
|
-
|
|
14226
|
+
const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all_tests;
|
|
14227
|
+
if (workspacePath && hasHookCommand(caseBeforeAllHook)) {
|
|
14228
|
+
const beforeAllHook = caseBeforeAllHook;
|
|
14229
|
+
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
13777
14230
|
if (setupDebug) {
|
|
13778
14231
|
console.log(
|
|
13779
|
-
`[setup] test=${evalCase.id} running before_all in cwd=${
|
|
14232
|
+
`[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
13780
14233
|
);
|
|
13781
14234
|
}
|
|
13782
14235
|
const scriptContext = {
|
|
@@ -13789,7 +14242,7 @@ async function runEvalCase(options) {
|
|
|
13789
14242
|
};
|
|
13790
14243
|
try {
|
|
13791
14244
|
beforeAllOutput = await executeWorkspaceScript(
|
|
13792
|
-
evalCase.
|
|
14245
|
+
toScriptConfig(beforeAllHook, "before_all_tests", `test '${evalCase.id}'`),
|
|
13793
14246
|
scriptContext
|
|
13794
14247
|
);
|
|
13795
14248
|
if (setupDebug) {
|
|
@@ -13814,7 +14267,9 @@ async function runEvalCase(options) {
|
|
|
13814
14267
|
}
|
|
13815
14268
|
}
|
|
13816
14269
|
}
|
|
13817
|
-
|
|
14270
|
+
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each_test;
|
|
14271
|
+
if (workspacePath && hasHookCommand(caseBeforeEachHook)) {
|
|
14272
|
+
const beforeEachHook = caseBeforeEachHook;
|
|
13818
14273
|
const scriptContext = {
|
|
13819
14274
|
workspacePath,
|
|
13820
14275
|
testId: evalCase.id,
|
|
@@ -13825,7 +14280,7 @@ async function runEvalCase(options) {
|
|
|
13825
14280
|
};
|
|
13826
14281
|
try {
|
|
13827
14282
|
beforeEachOutput = await executeWorkspaceScript(
|
|
13828
|
-
evalCase.
|
|
14283
|
+
toScriptConfig(beforeEachHook, "before_each_test", `test '${evalCase.id}'`),
|
|
13829
14284
|
scriptContext
|
|
13830
14285
|
);
|
|
13831
14286
|
} catch (error) {
|
|
@@ -13913,8 +14368,8 @@ async function runEvalCase(options) {
|
|
|
13913
14368
|
}
|
|
13914
14369
|
return errorResult;
|
|
13915
14370
|
}
|
|
13916
|
-
if (
|
|
13917
|
-
await cache.set(
|
|
14371
|
+
if (cacheKey && cache && !cachedResponse) {
|
|
14372
|
+
await cache.set(cacheKey, providerResponse);
|
|
13918
14373
|
}
|
|
13919
14374
|
const output = providerResponse.output;
|
|
13920
14375
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
@@ -13942,17 +14397,19 @@ async function runEvalCase(options) {
|
|
|
13942
14397
|
}
|
|
13943
14398
|
}
|
|
13944
14399
|
const providerError = extractProviderError(providerResponse);
|
|
13945
|
-
if (repoManager && workspacePath && evalCase.workspace?.reset
|
|
14400
|
+
if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each_test?.reset && evalCase.workspace.hooks.after_each_test.reset !== "none" && evalCase.workspace.repos) {
|
|
13946
14401
|
try {
|
|
13947
14402
|
await repoManager.reset(
|
|
13948
14403
|
evalCase.workspace.repos,
|
|
13949
14404
|
workspacePath,
|
|
13950
|
-
evalCase.workspace.reset
|
|
14405
|
+
evalCase.workspace.hooks.after_each_test.reset
|
|
13951
14406
|
);
|
|
13952
14407
|
} catch {
|
|
13953
14408
|
}
|
|
13954
14409
|
}
|
|
13955
|
-
|
|
14410
|
+
const caseAfterEachHook = evalCase.workspace?.hooks?.after_each_test;
|
|
14411
|
+
if (workspacePath && hasHookCommand(caseAfterEachHook)) {
|
|
14412
|
+
const afterEachHook = caseAfterEachHook;
|
|
13956
14413
|
const scriptContext = {
|
|
13957
14414
|
workspacePath,
|
|
13958
14415
|
testId: evalCase.id,
|
|
@@ -13963,7 +14420,7 @@ async function runEvalCase(options) {
|
|
|
13963
14420
|
};
|
|
13964
14421
|
try {
|
|
13965
14422
|
afterEachOutput = await executeWorkspaceScript(
|
|
13966
|
-
evalCase.
|
|
14423
|
+
toScriptConfig(afterEachHook, "after_each_test", `test '${evalCase.id}'`),
|
|
13967
14424
|
scriptContext,
|
|
13968
14425
|
"warn"
|
|
13969
14426
|
);
|
|
@@ -14013,8 +14470,13 @@ async function runEvalCase(options) {
|
|
|
14013
14470
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
14014
14471
|
});
|
|
14015
14472
|
} else if (isFailure) {
|
|
14016
|
-
|
|
14017
|
-
|
|
14473
|
+
if ((retainOnFailure ?? "keep") === "cleanup") {
|
|
14474
|
+
await cleanupWorkspace(workspacePath).catch(() => {
|
|
14475
|
+
});
|
|
14476
|
+
} else {
|
|
14477
|
+
return { ...finalResult, workspacePath };
|
|
14478
|
+
}
|
|
14479
|
+
} else if ((retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup")) !== "keep") {
|
|
14018
14480
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
14019
14481
|
});
|
|
14020
14482
|
}
|
|
@@ -14032,11 +14494,12 @@ async function runEvalCase(options) {
|
|
|
14032
14494
|
"evaluator_error"
|
|
14033
14495
|
);
|
|
14034
14496
|
if (workspacePath && !isSharedWorkspace) {
|
|
14035
|
-
if (forceCleanup) {
|
|
14497
|
+
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
14036
14498
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
14037
14499
|
});
|
|
14500
|
+
} else {
|
|
14501
|
+
return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
|
|
14038
14502
|
}
|
|
14039
|
-
return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
|
|
14040
14503
|
}
|
|
14041
14504
|
return { ...errorResult, beforeEachOutput, afterEachOutput };
|
|
14042
14505
|
}
|
|
@@ -14055,7 +14518,9 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
14055
14518
|
useCache: false,
|
|
14056
14519
|
// Force cleanup for intermediate trials
|
|
14057
14520
|
cleanupWorkspaces: isLastDeclaredTrial ? options.cleanupWorkspaces : true,
|
|
14058
|
-
keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false
|
|
14521
|
+
keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false,
|
|
14522
|
+
retainOnSuccess: isLastDeclaredTrial ? options.retainOnSuccess : "cleanup",
|
|
14523
|
+
retainOnFailure: isLastDeclaredTrial ? options.retainOnFailure : "cleanup"
|
|
14059
14524
|
};
|
|
14060
14525
|
const result = await runEvalCase(trialOptions);
|
|
14061
14526
|
allResults.push(result);
|
|
@@ -14344,7 +14809,7 @@ async function runEvaluatorList(options) {
|
|
|
14344
14809
|
fileChanges,
|
|
14345
14810
|
workspacePath
|
|
14346
14811
|
};
|
|
14347
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
14812
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path39.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
14348
14813
|
const dispatchContext = {
|
|
14349
14814
|
judgeProvider,
|
|
14350
14815
|
targetResolver,
|
|
@@ -14647,7 +15112,7 @@ function computeWeightedMean(entries) {
|
|
|
14647
15112
|
|
|
14648
15113
|
// src/evaluation/evaluate.ts
|
|
14649
15114
|
import { existsSync as existsSync3 } from "node:fs";
|
|
14650
|
-
import
|
|
15115
|
+
import path40 from "node:path";
|
|
14651
15116
|
async function evaluate(config) {
|
|
14652
15117
|
const startTime = Date.now();
|
|
14653
15118
|
if (config.tests && config.specFile) {
|
|
@@ -14669,13 +15134,13 @@ async function evaluate(config) {
|
|
|
14669
15134
|
let evalCases;
|
|
14670
15135
|
let testFilePath;
|
|
14671
15136
|
if (config.specFile) {
|
|
14672
|
-
testFilePath =
|
|
15137
|
+
testFilePath = path40.resolve(config.specFile);
|
|
14673
15138
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
14674
15139
|
verbose: config.verbose,
|
|
14675
15140
|
filter: config.filter
|
|
14676
15141
|
});
|
|
14677
15142
|
} else {
|
|
14678
|
-
testFilePath =
|
|
15143
|
+
testFilePath = path40.join(process.cwd(), "__programmatic__.yaml");
|
|
14679
15144
|
evalCases = (config.tests ?? []).map((test) => {
|
|
14680
15145
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
14681
15146
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -14761,10 +15226,10 @@ function computeSummary(results, durationMs) {
|
|
|
14761
15226
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
14762
15227
|
async function discoverDefaultTarget(repoRoot) {
|
|
14763
15228
|
const cwd = process.cwd();
|
|
14764
|
-
const chain = buildDirectoryChain(
|
|
15229
|
+
const chain = buildDirectoryChain(path40.join(cwd, "_placeholder"), repoRoot);
|
|
14765
15230
|
for (const dir of chain) {
|
|
14766
15231
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
14767
|
-
const targetsPath =
|
|
15232
|
+
const targetsPath = path40.join(dir, candidate);
|
|
14768
15233
|
if (!existsSync3(targetsPath)) continue;
|
|
14769
15234
|
try {
|
|
14770
15235
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
@@ -14779,10 +15244,10 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
14779
15244
|
async function loadEnvHierarchy(repoRoot) {
|
|
14780
15245
|
const { readFileSync: readFileSync2 } = await import("node:fs");
|
|
14781
15246
|
const cwd = process.cwd();
|
|
14782
|
-
const chain = buildDirectoryChain(
|
|
15247
|
+
const chain = buildDirectoryChain(path40.join(cwd, "_placeholder"), repoRoot);
|
|
14783
15248
|
const envFiles = [];
|
|
14784
15249
|
for (const dir of chain) {
|
|
14785
|
-
const envPath =
|
|
15250
|
+
const envPath = path40.join(dir, ".env");
|
|
14786
15251
|
if (existsSync3(envPath)) envFiles.push(envPath);
|
|
14787
15252
|
}
|
|
14788
15253
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
@@ -14963,8 +15428,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
14963
15428
|
}
|
|
14964
15429
|
|
|
14965
15430
|
// src/evaluation/cache/response-cache.ts
|
|
14966
|
-
import { mkdir as mkdir13, readFile as
|
|
14967
|
-
import
|
|
15431
|
+
import { mkdir as mkdir13, readFile as readFile12, writeFile as writeFile8 } from "node:fs/promises";
|
|
15432
|
+
import path41 from "node:path";
|
|
14968
15433
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
14969
15434
|
var ResponseCache = class {
|
|
14970
15435
|
cachePath;
|
|
@@ -14974,7 +15439,7 @@ var ResponseCache = class {
|
|
|
14974
15439
|
async get(key) {
|
|
14975
15440
|
const filePath = this.keyToPath(key);
|
|
14976
15441
|
try {
|
|
14977
|
-
const data = await
|
|
15442
|
+
const data = await readFile12(filePath, "utf8");
|
|
14978
15443
|
return JSON.parse(data);
|
|
14979
15444
|
} catch {
|
|
14980
15445
|
return void 0;
|
|
@@ -14982,13 +15447,13 @@ var ResponseCache = class {
|
|
|
14982
15447
|
}
|
|
14983
15448
|
async set(key, value) {
|
|
14984
15449
|
const filePath = this.keyToPath(key);
|
|
14985
|
-
const dir =
|
|
15450
|
+
const dir = path41.dirname(filePath);
|
|
14986
15451
|
await mkdir13(dir, { recursive: true });
|
|
14987
15452
|
await writeFile8(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
14988
15453
|
}
|
|
14989
15454
|
keyToPath(key) {
|
|
14990
15455
|
const prefix = key.slice(0, 2);
|
|
14991
|
-
return
|
|
15456
|
+
return path41.join(this.cachePath, prefix, `${key}.json`);
|
|
14992
15457
|
}
|
|
14993
15458
|
};
|
|
14994
15459
|
function shouldEnableCache(params) {
|
|
@@ -15470,6 +15935,7 @@ export {
|
|
|
15470
15935
|
TokenUsageEvaluator,
|
|
15471
15936
|
ToolTrajectoryEvaluator,
|
|
15472
15937
|
WorkspaceCreationError,
|
|
15938
|
+
WorkspacePoolManager,
|
|
15473
15939
|
assembleLlmJudgePrompt,
|
|
15474
15940
|
avgToolDurationMs,
|
|
15475
15941
|
buildDirectoryChain,
|
|
@@ -15484,6 +15950,7 @@ export {
|
|
|
15484
15950
|
cleanupEvalWorkspaces,
|
|
15485
15951
|
cleanupWorkspace,
|
|
15486
15952
|
computeTraceSummary,
|
|
15953
|
+
computeWorkspaceFingerprint,
|
|
15487
15954
|
consumeClaudeLogEntries,
|
|
15488
15955
|
consumeCodexLogEntries,
|
|
15489
15956
|
consumeCopilotCliLogEntries,
|
|
@@ -15516,11 +15983,11 @@ export {
|
|
|
15516
15983
|
freeformEvaluationSchema,
|
|
15517
15984
|
generateRubrics,
|
|
15518
15985
|
getAgentvHome,
|
|
15519
|
-
getGitCacheRoot,
|
|
15520
15986
|
getHitCount,
|
|
15521
15987
|
getSubagentsRoot,
|
|
15522
15988
|
getTraceStateRoot,
|
|
15523
15989
|
getWorkspacePath,
|
|
15990
|
+
getWorkspacePoolRoot,
|
|
15524
15991
|
getWorkspacesRoot,
|
|
15525
15992
|
initializeBaseline,
|
|
15526
15993
|
isEvaluatorKind,
|