@agentv/core 2.14.3 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1009 -504
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +85 -1
- package/dist/index.d.ts +85 -1
- package/dist/index.js +950 -448
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -612,6 +612,17 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
612
612
|
} else if (otelFile !== void 0) {
|
|
613
613
|
logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
|
|
614
614
|
}
|
|
615
|
+
if (typeof obj.pool_workspaces === "boolean") {
|
|
616
|
+
result.pool_workspaces = obj.pool_workspaces;
|
|
617
|
+
} else if (obj.pool_workspaces !== void 0) {
|
|
618
|
+
logWarning(`Invalid execution.pool_workspaces in ${configPath}, expected boolean`);
|
|
619
|
+
}
|
|
620
|
+
const poolSlots = obj.pool_slots;
|
|
621
|
+
if (typeof poolSlots === "number" && Number.isInteger(poolSlots) && poolSlots >= 1 && poolSlots <= 50) {
|
|
622
|
+
result.pool_slots = poolSlots;
|
|
623
|
+
} else if (poolSlots !== void 0) {
|
|
624
|
+
logWarning(`Invalid execution.pool_slots in ${configPath}, expected integer 1-50`);
|
|
625
|
+
}
|
|
615
626
|
return Object.keys(result).length > 0 ? result : void 0;
|
|
616
627
|
}
|
|
617
628
|
function logWarning(message) {
|
|
@@ -2053,6 +2064,7 @@ async function processMessages(options) {
|
|
|
2053
2064
|
repoRootPath,
|
|
2054
2065
|
guidelinePatterns,
|
|
2055
2066
|
guidelinePaths,
|
|
2067
|
+
treatFileSegmentsAsGuidelines,
|
|
2056
2068
|
textParts,
|
|
2057
2069
|
messageType,
|
|
2058
2070
|
verbose
|
|
@@ -2100,16 +2112,20 @@ async function processMessages(options) {
|
|
|
2100
2112
|
}
|
|
2101
2113
|
try {
|
|
2102
2114
|
const fileContent = (await readFile4(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2115
|
+
const classifyAsGuideline = shouldTreatAsGuideline({
|
|
2116
|
+
messageType,
|
|
2117
|
+
resolvedPath,
|
|
2118
|
+
repoRootPath,
|
|
2119
|
+
guidelinePatterns,
|
|
2120
|
+
treatFileSegmentsAsGuidelines
|
|
2121
|
+
});
|
|
2122
|
+
if (classifyAsGuideline && guidelinePaths) {
|
|
2123
|
+
guidelinePaths.push(path5.resolve(resolvedPath));
|
|
2124
|
+
if (verbose) {
|
|
2125
|
+
console.log(` [Guideline] Found: ${displayPath}`);
|
|
2126
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
2112
2127
|
}
|
|
2128
|
+
continue;
|
|
2113
2129
|
}
|
|
2114
2130
|
segments.push({
|
|
2115
2131
|
type: "file",
|
|
@@ -2138,6 +2154,26 @@ async function processMessages(options) {
|
|
|
2138
2154
|
}
|
|
2139
2155
|
return segments;
|
|
2140
2156
|
}
|
|
2157
|
+
/**
 * Decide whether a resolved file reference should be classified as a guideline.
 *
 * Rules, checked in order:
 *  1. Only files referenced from "input" messages can be guidelines.
 *  2. When `treatFileSegmentsAsGuidelines` is truthy, every such file is one.
 *  3. Otherwise the file's path relative to the repo root is matched against
 *     `guidelinePatterns` via `isGuidelineFile`; with no patterns nothing matches.
 *
 * @param {object} options
 * @param {string} options.messageType - Kind of message the file came from.
 * @param {string} options.resolvedPath - Resolved path of the referenced file.
 * @param {string} options.repoRootPath - Root used to relativize `resolvedPath`.
 * @param {string[]|undefined} options.guidelinePatterns - Patterns handed to `isGuidelineFile`.
 * @param {boolean|undefined} options.treatFileSegmentsAsGuidelines - Force-classify flag.
 * @returns {boolean} Result of the rules above (rule 3 returns whatever `isGuidelineFile` returns).
 */
function shouldTreatAsGuideline(options) {
  const isInputMessage = options.messageType === "input";
  if (!isInputMessage) return false;
  if (options.treatFileSegmentsAsGuidelines) return true;

  const patterns = options.guidelinePatterns;
  const hasPatterns = Boolean(patterns) && patterns.length !== 0;
  if (!hasPatterns) return false;

  // Patterns are matched against the repo-relative path, not the absolute one.
  const repoRelativePath = path5.relative(options.repoRootPath, options.resolvedPath);
  return isGuidelineFile(repoRelativePath, patterns);
}
|
|
2141
2177
|
/**
 * Narrow an arbitrary value to a primitive string.
 *
 * @param {unknown} value - Any value.
 * @returns {string|undefined} `value` itself when it is a primitive string, otherwise `undefined`.
 */
function asString3(value) {
  if (typeof value !== "string") {
    return void 0;
  }
  return value;
}
|
|
@@ -2476,6 +2512,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2476
2512
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
2477
2513
|
console.log(` - ${guidelinePath}`);
|
|
2478
2514
|
}
|
|
2515
|
+
} else if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
2516
|
+
console.log(" No guidelines found (guideline_patterns not configured)");
|
|
2479
2517
|
} else {
|
|
2480
2518
|
console.log(" No guidelines found");
|
|
2481
2519
|
}
|
|
@@ -2845,7 +2883,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2845
2883
|
} else {
|
|
2846
2884
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
2847
2885
|
}
|
|
2848
|
-
const suiteWorkspace =
|
|
2886
|
+
const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
|
|
2849
2887
|
const suiteInputMessages = expandInputShorthand(suite.input);
|
|
2850
2888
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
2851
2889
|
const _globalTarget = asString6(rawGlobalExecution?.target) ?? asString6(suite.target);
|
|
@@ -2881,12 +2919,24 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2881
2919
|
}
|
|
2882
2920
|
const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
|
|
2883
2921
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
2884
|
-
const
|
|
2922
|
+
const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
|
|
2923
|
+
const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
|
|
2885
2924
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
2886
2925
|
const guidelinePaths = [];
|
|
2887
2926
|
const inputTextParts = [];
|
|
2888
|
-
const
|
|
2889
|
-
messages:
|
|
2927
|
+
const suiteInputSegments = effectiveSuiteInputMessages ? await processMessages({
|
|
2928
|
+
messages: effectiveSuiteInputMessages,
|
|
2929
|
+
searchRoots,
|
|
2930
|
+
repoRootPath,
|
|
2931
|
+
guidelinePatterns,
|
|
2932
|
+
guidelinePaths,
|
|
2933
|
+
treatFileSegmentsAsGuidelines: true,
|
|
2934
|
+
textParts: inputTextParts,
|
|
2935
|
+
messageType: "input",
|
|
2936
|
+
verbose
|
|
2937
|
+
}) : [];
|
|
2938
|
+
const testInputSegments = await processMessages({
|
|
2939
|
+
messages: testInputMessages,
|
|
2890
2940
|
searchRoots,
|
|
2891
2941
|
repoRootPath,
|
|
2892
2942
|
guidelinePatterns,
|
|
@@ -2895,6 +2945,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2895
2945
|
messageType: "input",
|
|
2896
2946
|
verbose
|
|
2897
2947
|
});
|
|
2948
|
+
const inputSegments = [...suiteInputSegments, ...testInputSegments];
|
|
2898
2949
|
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
2899
2950
|
messages: expectedMessages,
|
|
2900
2951
|
searchRoots,
|
|
@@ -2942,7 +2993,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2942
2993
|
...guidelinePaths.map((guidelinePath) => path8.resolve(guidelinePath)),
|
|
2943
2994
|
...userFilePaths
|
|
2944
2995
|
];
|
|
2945
|
-
const caseWorkspace =
|
|
2996
|
+
const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
|
|
2946
2997
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
2947
2998
|
const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
|
|
2948
2999
|
const caseTargets = extractTargetsFromTestCase(evalcase);
|
|
@@ -2973,6 +3024,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2973
3024
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
2974
3025
|
console.log(` - ${guidelinePath}`);
|
|
2975
3026
|
}
|
|
3027
|
+
} else if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
3028
|
+
console.log(" No guidelines found (guideline_patterns not configured)");
|
|
2976
3029
|
} else {
|
|
2977
3030
|
console.log(" No guidelines found");
|
|
2978
3031
|
}
|
|
@@ -3072,6 +3125,26 @@ function parseResetConfig(raw) {
|
|
|
3072
3125
|
...afterEach !== void 0 && { after_each: afterEach }
|
|
3073
3126
|
};
|
|
3074
3127
|
}
|
|
3128
|
+
/**
 * Resolve a `workspace` value from an eval file into a parsed workspace config.
 *
 * - A string is treated as a path, resolved against `evalFileDir`, to an
 *   external workspace file. The file is read, parsed with `parse2`, and then
 *   handed to `parseWorkspaceConfig` using the workspace file's own directory
 *   as the base directory.
 * - Any other value is passed straight to `parseWorkspaceConfig` together
 *   with `evalFileDir`.
 *
 * @param {unknown} raw - Inline workspace value or a path to a workspace file.
 * @param {string} evalFileDir - Directory of the eval file that referenced the workspace.
 * @returns {Promise<unknown>} Whatever `parseWorkspaceConfig` returns for the resolved input.
 * @throws {Error} If the workspace file cannot be read (the underlying
 *   filesystem error is attached as `cause`), or if its parsed content is not
 *   an object.
 */
async function resolveWorkspaceConfig(raw, evalFileDir) {
  if (typeof raw !== "string") {
    return parseWorkspaceConfig(raw, evalFileDir);
  }
  const workspaceFilePath = path8.resolve(evalFileDir, raw);
  let content;
  try {
    content = await readFile7(workspaceFilePath, "utf8");
  } catch (error) {
    // Keep the established message, but preserve the real failure (ENOENT,
    // EACCES, EISDIR, ...) so it is not lost behind a generic "not found".
    throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`, {
      cause: error
    });
  }
  const parsed = parse2(content);
  if (!isJsonObject(parsed)) {
    throw new Error(
      `Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
    );
  }
  // Parsing is anchored at the workspace file's directory, not the eval file's.
  const workspaceFileDir = path8.dirname(workspaceFilePath);
  return parseWorkspaceConfig(parsed, workspaceFileDir);
}
|
|
3075
3148
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
3076
3149
|
if (!isJsonObject(raw)) return void 0;
|
|
3077
3150
|
const obj = raw;
|
|
@@ -7174,6 +7247,9 @@ function getSubagentsRoot() {
|
|
|
7174
7247
|
/**
 * Build the path of the "trace-state" directory under the AgentV home directory.
 *
 * @returns {string} `<agentv home>/trace-state`, joined with `path21.join`.
 */
function getTraceStateRoot() {
  const agentvHome = getAgentvHome();
  return path21.join(agentvHome, "trace-state");
}
|
|
7250
|
+
/**
 * Build the path of the "workspace-pool" directory under the AgentV home directory.
 *
 * @returns {string} `<agentv home>/workspace-pool`, joined with `path21.join`.
 */
function getWorkspacePoolRoot() {
  const agentvHome = getAgentvHome();
  return path21.join(agentvHome, "workspace-pool");
}
|
|
7177
7253
|
|
|
7178
7254
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
7179
7255
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
@@ -7996,8 +8072,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
7996
8072
|
|
|
7997
8073
|
**IMPORTANT**: Follow these exact steps:
|
|
7998
8074
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
7999
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
8000
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
8001
8075
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
8002
8076
|
\`\`\`
|
|
8003
8077
|
Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
|
|
@@ -8014,8 +8088,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
8014
8088
|
|
|
8015
8089
|
**IMPORTANT**: Follow these exact steps:
|
|
8016
8090
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
8017
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
8018
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
8019
8091
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
8020
8092
|
3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
|
|
8021
8093
|
`;
|
|
@@ -8628,16 +8700,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
8628
8700
|
});
|
|
8629
8701
|
}
|
|
8630
8702
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
8631
|
-
const { mkdir:
|
|
8703
|
+
const { mkdir: mkdir15, readFile: readFile13, rm: rm7, writeFile: writeFile10 } = await import("node:fs/promises");
|
|
8632
8704
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
8633
|
-
const
|
|
8705
|
+
const path42 = await import("node:path");
|
|
8634
8706
|
const { randomUUID: randomUUID8 } = await import("node:crypto");
|
|
8635
|
-
const dir =
|
|
8636
|
-
await
|
|
8637
|
-
const stdinPath =
|
|
8638
|
-
const stdoutPath =
|
|
8639
|
-
const stderrPath =
|
|
8640
|
-
await
|
|
8707
|
+
const dir = path42.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
8708
|
+
await mkdir15(dir, { recursive: true });
|
|
8709
|
+
const stdinPath = path42.join(dir, "stdin.txt");
|
|
8710
|
+
const stdoutPath = path42.join(dir, "stdout.txt");
|
|
8711
|
+
const stderrPath = path42.join(dir, "stderr.txt");
|
|
8712
|
+
await writeFile10(stdinPath, stdinPayload, "utf8");
|
|
8641
8713
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
8642
8714
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
8643
8715
|
try {
|
|
@@ -8666,11 +8738,11 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
8666
8738
|
resolve(code ?? 0);
|
|
8667
8739
|
});
|
|
8668
8740
|
});
|
|
8669
|
-
const stdout = (await
|
|
8670
|
-
const stderr = (await
|
|
8741
|
+
const stdout = (await readFile13(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
8742
|
+
const stderr = (await readFile13(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
8671
8743
|
return { stdout, stderr, exitCode };
|
|
8672
8744
|
} finally {
|
|
8673
|
-
await
|
|
8745
|
+
await rm7(dir, { recursive: true, force: true });
|
|
8674
8746
|
}
|
|
8675
8747
|
}
|
|
8676
8748
|
|
|
@@ -8988,7 +9060,7 @@ var CodeEvaluator = class {
|
|
|
8988
9060
|
outputPath,
|
|
8989
9061
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
8990
9062
|
inputFiles: context.evalCase.file_paths.filter(
|
|
8991
|
-
(
|
|
9063
|
+
(path42) => !context.evalCase.guideline_paths.includes(path42)
|
|
8992
9064
|
),
|
|
8993
9065
|
input: context.evalCase.input,
|
|
8994
9066
|
trace: context.trace ?? null,
|
|
@@ -9238,6 +9310,8 @@ ${context.fileChanges}`;
|
|
|
9238
9310
|
};
|
|
9239
9311
|
} catch (e) {
|
|
9240
9312
|
const message = e instanceof Error ? e.message : String(e);
|
|
9313
|
+
const evalName = context.evaluator?.name ?? "llm-judge";
|
|
9314
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
9241
9315
|
return {
|
|
9242
9316
|
score: 0,
|
|
9243
9317
|
verdict: "skip",
|
|
@@ -9266,24 +9340,39 @@ ${context.fileChanges}`;
|
|
|
9266
9340
|
systemPrompt,
|
|
9267
9341
|
target: judgeProvider.targetName
|
|
9268
9342
|
};
|
|
9269
|
-
|
|
9270
|
-
|
|
9271
|
-
|
|
9272
|
-
|
|
9273
|
-
|
|
9274
|
-
|
|
9275
|
-
|
|
9276
|
-
|
|
9277
|
-
|
|
9278
|
-
|
|
9279
|
-
|
|
9280
|
-
|
|
9281
|
-
|
|
9282
|
-
|
|
9283
|
-
|
|
9284
|
-
|
|
9285
|
-
|
|
9286
|
-
|
|
9343
|
+
try {
|
|
9344
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
9345
|
+
context,
|
|
9346
|
+
judgeProvider,
|
|
9347
|
+
systemPrompt,
|
|
9348
|
+
userPrompt: prompt,
|
|
9349
|
+
schema: rubricEvaluationSchema
|
|
9350
|
+
});
|
|
9351
|
+
const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
|
|
9352
|
+
return {
|
|
9353
|
+
score,
|
|
9354
|
+
verdict,
|
|
9355
|
+
hits,
|
|
9356
|
+
misses,
|
|
9357
|
+
expectedAspectCount: rubrics.length,
|
|
9358
|
+
reasoning: data.overall_reasoning,
|
|
9359
|
+
evaluatorRawRequest,
|
|
9360
|
+
tokenUsage
|
|
9361
|
+
};
|
|
9362
|
+
} catch (e) {
|
|
9363
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
9364
|
+
const evalName = context.evaluator?.name ?? "llm-judge";
|
|
9365
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
9366
|
+
return {
|
|
9367
|
+
score: 0,
|
|
9368
|
+
verdict: "skip",
|
|
9369
|
+
hits: [],
|
|
9370
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
9371
|
+
expectedAspectCount: rubrics.length,
|
|
9372
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
9373
|
+
evaluatorRawRequest
|
|
9374
|
+
};
|
|
9375
|
+
}
|
|
9287
9376
|
}
|
|
9288
9377
|
/**
|
|
9289
9378
|
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
@@ -9297,25 +9386,40 @@ ${context.fileChanges}`;
|
|
|
9297
9386
|
systemPrompt,
|
|
9298
9387
|
target: judgeProvider.targetName
|
|
9299
9388
|
};
|
|
9300
|
-
|
|
9301
|
-
|
|
9302
|
-
|
|
9303
|
-
|
|
9304
|
-
|
|
9305
|
-
|
|
9306
|
-
|
|
9307
|
-
|
|
9308
|
-
|
|
9309
|
-
|
|
9310
|
-
|
|
9311
|
-
|
|
9312
|
-
|
|
9313
|
-
|
|
9314
|
-
|
|
9315
|
-
|
|
9316
|
-
|
|
9317
|
-
|
|
9318
|
-
|
|
9389
|
+
try {
|
|
9390
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
9391
|
+
context,
|
|
9392
|
+
judgeProvider,
|
|
9393
|
+
systemPrompt,
|
|
9394
|
+
userPrompt: prompt,
|
|
9395
|
+
schema: scoreRangeEvaluationSchema
|
|
9396
|
+
});
|
|
9397
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
9398
|
+
return {
|
|
9399
|
+
score,
|
|
9400
|
+
verdict,
|
|
9401
|
+
hits,
|
|
9402
|
+
misses,
|
|
9403
|
+
expectedAspectCount: rubrics.length,
|
|
9404
|
+
reasoning: data.overall_reasoning,
|
|
9405
|
+
evaluatorRawRequest,
|
|
9406
|
+
details,
|
|
9407
|
+
tokenUsage
|
|
9408
|
+
};
|
|
9409
|
+
} catch (e) {
|
|
9410
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
9411
|
+
const evalName = context.evaluator?.name ?? "llm-judge";
|
|
9412
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
9413
|
+
return {
|
|
9414
|
+
score: 0,
|
|
9415
|
+
verdict: "skip",
|
|
9416
|
+
hits: [],
|
|
9417
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
9418
|
+
expectedAspectCount: rubrics.length,
|
|
9419
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
9420
|
+
evaluatorRawRequest
|
|
9421
|
+
};
|
|
9422
|
+
}
|
|
9319
9423
|
}
|
|
9320
9424
|
/**
|
|
9321
9425
|
* Build prompt for score-range rubric evaluation.
|
|
@@ -9601,19 +9705,13 @@ var CompositeEvaluator = class {
|
|
|
9601
9705
|
runWeightedAverage(results, weights) {
|
|
9602
9706
|
let totalWeight = 0;
|
|
9603
9707
|
let weightedSum = 0;
|
|
9708
|
+
let evaluatedCount = 0;
|
|
9604
9709
|
const allHits = [];
|
|
9605
9710
|
const allMisses = [];
|
|
9606
9711
|
const reasoningParts = [];
|
|
9607
9712
|
const scores = [];
|
|
9608
9713
|
for (const member of results) {
|
|
9609
9714
|
const weight = weights?.[member.id] ?? 1;
|
|
9610
|
-
totalWeight += weight;
|
|
9611
|
-
weightedSum += member.result.score * weight;
|
|
9612
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
9613
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
9614
|
-
if (member.result.reasoning) {
|
|
9615
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
9616
|
-
}
|
|
9617
9715
|
scores.push({
|
|
9618
9716
|
name: member.id,
|
|
9619
9717
|
type: member.type,
|
|
@@ -9628,6 +9726,32 @@ var CompositeEvaluator = class {
|
|
|
9628
9726
|
details: member.result.details,
|
|
9629
9727
|
tokenUsage: member.result.tokenUsage
|
|
9630
9728
|
});
|
|
9729
|
+
if (member.result.verdict === "skip") {
|
|
9730
|
+
continue;
|
|
9731
|
+
}
|
|
9732
|
+
evaluatedCount++;
|
|
9733
|
+
totalWeight += weight;
|
|
9734
|
+
weightedSum += member.result.score * weight;
|
|
9735
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
9736
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
9737
|
+
if (member.result.reasoning) {
|
|
9738
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
9739
|
+
}
|
|
9740
|
+
}
|
|
9741
|
+
if (evaluatedCount === 0 && results.length > 0) {
|
|
9742
|
+
return {
|
|
9743
|
+
score: 0,
|
|
9744
|
+
verdict: "skip",
|
|
9745
|
+
hits: [],
|
|
9746
|
+
misses: [],
|
|
9747
|
+
expectedAspectCount: 1,
|
|
9748
|
+
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
9749
|
+
evaluatorRawRequest: {
|
|
9750
|
+
aggregator: "weighted_average",
|
|
9751
|
+
...weights ? { weights } : {}
|
|
9752
|
+
},
|
|
9753
|
+
scores
|
|
9754
|
+
};
|
|
9631
9755
|
}
|
|
9632
9756
|
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
9633
9757
|
return {
|
|
@@ -9651,19 +9775,8 @@ var CompositeEvaluator = class {
|
|
|
9651
9775
|
const reasoningParts = [];
|
|
9652
9776
|
let passingCount = 0;
|
|
9653
9777
|
let borderlineCount = 0;
|
|
9778
|
+
let evaluatedCount = 0;
|
|
9654
9779
|
for (const member of results) {
|
|
9655
|
-
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
9656
|
-
if (isPassing) {
|
|
9657
|
-
passingCount++;
|
|
9658
|
-
if (member.result.verdict === "borderline") {
|
|
9659
|
-
borderlineCount++;
|
|
9660
|
-
}
|
|
9661
|
-
}
|
|
9662
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
9663
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
9664
|
-
if (member.result.reasoning) {
|
|
9665
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
9666
|
-
}
|
|
9667
9780
|
scores.push({
|
|
9668
9781
|
name: member.id,
|
|
9669
9782
|
type: member.type,
|
|
@@ -9677,8 +9790,39 @@ var CompositeEvaluator = class {
|
|
|
9677
9790
|
details: member.result.details,
|
|
9678
9791
|
tokenUsage: member.result.tokenUsage
|
|
9679
9792
|
});
|
|
9793
|
+
if (member.result.verdict === "skip") {
|
|
9794
|
+
continue;
|
|
9795
|
+
}
|
|
9796
|
+
evaluatedCount++;
|
|
9797
|
+
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
9798
|
+
if (isPassing) {
|
|
9799
|
+
passingCount++;
|
|
9800
|
+
if (member.result.verdict === "borderline") {
|
|
9801
|
+
borderlineCount++;
|
|
9802
|
+
}
|
|
9803
|
+
}
|
|
9804
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
9805
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
9806
|
+
if (member.result.reasoning) {
|
|
9807
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
9808
|
+
}
|
|
9809
|
+
}
|
|
9810
|
+
if (evaluatedCount === 0 && results.length > 0) {
|
|
9811
|
+
return {
|
|
9812
|
+
score: 0,
|
|
9813
|
+
verdict: "skip",
|
|
9814
|
+
hits: [],
|
|
9815
|
+
misses: [],
|
|
9816
|
+
expectedAspectCount: 1,
|
|
9817
|
+
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
9818
|
+
evaluatorRawRequest: {
|
|
9819
|
+
aggregator: "threshold",
|
|
9820
|
+
threshold
|
|
9821
|
+
},
|
|
9822
|
+
scores
|
|
9823
|
+
};
|
|
9680
9824
|
}
|
|
9681
|
-
const totalCount =
|
|
9825
|
+
const totalCount = evaluatedCount;
|
|
9682
9826
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
9683
9827
|
const pass = score >= threshold;
|
|
9684
9828
|
if (pass && borderlineCount > 0) {
|
|
@@ -10186,115 +10330,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
10186
10330
|
* Evaluate a single field against the expected value.
|
|
10187
10331
|
*/
|
|
10188
10332
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
10189
|
-
const { path:
|
|
10190
|
-
const candidateValue = resolvePath(candidateData,
|
|
10191
|
-
const expectedValue = resolvePath(expectedData,
|
|
10333
|
+
const { path: path42, match, required = true, weight = 1 } = fieldConfig;
|
|
10334
|
+
const candidateValue = resolvePath(candidateData, path42);
|
|
10335
|
+
const expectedValue = resolvePath(expectedData, path42);
|
|
10192
10336
|
if (expectedValue === void 0) {
|
|
10193
10337
|
return {
|
|
10194
|
-
path:
|
|
10338
|
+
path: path42,
|
|
10195
10339
|
score: 1,
|
|
10196
10340
|
// No expected value means no comparison needed
|
|
10197
10341
|
weight,
|
|
10198
10342
|
hit: true,
|
|
10199
|
-
message: `${
|
|
10343
|
+
message: `${path42}: no expected value`
|
|
10200
10344
|
};
|
|
10201
10345
|
}
|
|
10202
10346
|
if (candidateValue === void 0) {
|
|
10203
10347
|
if (required) {
|
|
10204
10348
|
return {
|
|
10205
|
-
path:
|
|
10349
|
+
path: path42,
|
|
10206
10350
|
score: 0,
|
|
10207
10351
|
weight,
|
|
10208
10352
|
hit: false,
|
|
10209
|
-
message: `${
|
|
10353
|
+
message: `${path42} (required, missing)`
|
|
10210
10354
|
};
|
|
10211
10355
|
}
|
|
10212
10356
|
return {
|
|
10213
|
-
path:
|
|
10357
|
+
path: path42,
|
|
10214
10358
|
score: 1,
|
|
10215
10359
|
// Don't penalize missing optional fields
|
|
10216
10360
|
weight: 0,
|
|
10217
10361
|
// Zero weight means it won't affect the score
|
|
10218
10362
|
hit: true,
|
|
10219
|
-
message: `${
|
|
10363
|
+
message: `${path42}: optional field missing`
|
|
10220
10364
|
};
|
|
10221
10365
|
}
|
|
10222
10366
|
switch (match) {
|
|
10223
10367
|
case "exact":
|
|
10224
|
-
return this.compareExact(
|
|
10368
|
+
return this.compareExact(path42, candidateValue, expectedValue, weight);
|
|
10225
10369
|
case "numeric_tolerance":
|
|
10226
10370
|
return this.compareNumericTolerance(
|
|
10227
|
-
|
|
10371
|
+
path42,
|
|
10228
10372
|
candidateValue,
|
|
10229
10373
|
expectedValue,
|
|
10230
10374
|
fieldConfig,
|
|
10231
10375
|
weight
|
|
10232
10376
|
);
|
|
10233
10377
|
case "date":
|
|
10234
|
-
return this.compareDate(
|
|
10378
|
+
return this.compareDate(path42, candidateValue, expectedValue, fieldConfig, weight);
|
|
10235
10379
|
default:
|
|
10236
10380
|
return {
|
|
10237
|
-
path:
|
|
10381
|
+
path: path42,
|
|
10238
10382
|
score: 0,
|
|
10239
10383
|
weight,
|
|
10240
10384
|
hit: false,
|
|
10241
|
-
message: `${
|
|
10385
|
+
message: `${path42}: unknown match type "${match}"`
|
|
10242
10386
|
};
|
|
10243
10387
|
}
|
|
10244
10388
|
}
|
|
10245
10389
|
/**
|
|
10246
10390
|
* Exact equality comparison.
|
|
10247
10391
|
*/
|
|
10248
|
-
compareExact(
|
|
10392
|
+
compareExact(path42, candidateValue, expectedValue, weight) {
|
|
10249
10393
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
10250
10394
|
return {
|
|
10251
|
-
path:
|
|
10395
|
+
path: path42,
|
|
10252
10396
|
score: 1,
|
|
10253
10397
|
weight,
|
|
10254
10398
|
hit: true,
|
|
10255
|
-
message:
|
|
10399
|
+
message: path42
|
|
10256
10400
|
};
|
|
10257
10401
|
}
|
|
10258
10402
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
10259
10403
|
return {
|
|
10260
|
-
path:
|
|
10404
|
+
path: path42,
|
|
10261
10405
|
score: 0,
|
|
10262
10406
|
weight,
|
|
10263
10407
|
hit: false,
|
|
10264
|
-
message: `${
|
|
10408
|
+
message: `${path42} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
10265
10409
|
};
|
|
10266
10410
|
}
|
|
10267
10411
|
return {
|
|
10268
|
-
path:
|
|
10412
|
+
path: path42,
|
|
10269
10413
|
score: 0,
|
|
10270
10414
|
weight,
|
|
10271
10415
|
hit: false,
|
|
10272
|
-
message: `${
|
|
10416
|
+
message: `${path42} (value mismatch)`
|
|
10273
10417
|
};
|
|
10274
10418
|
}
|
|
10275
10419
|
/**
|
|
10276
10420
|
* Numeric comparison with absolute or relative tolerance.
|
|
10277
10421
|
*/
|
|
10278
|
-
compareNumericTolerance(
|
|
10422
|
+
compareNumericTolerance(path42, candidateValue, expectedValue, fieldConfig, weight) {
|
|
10279
10423
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
10280
10424
|
const candidateNum = toNumber2(candidateValue);
|
|
10281
10425
|
const expectedNum = toNumber2(expectedValue);
|
|
10282
10426
|
if (candidateNum === null || expectedNum === null) {
|
|
10283
10427
|
return {
|
|
10284
|
-
path:
|
|
10428
|
+
path: path42,
|
|
10285
10429
|
score: 0,
|
|
10286
10430
|
weight,
|
|
10287
10431
|
hit: false,
|
|
10288
|
-
message: `${
|
|
10432
|
+
message: `${path42} (non-numeric value)`
|
|
10289
10433
|
};
|
|
10290
10434
|
}
|
|
10291
10435
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
10292
10436
|
return {
|
|
10293
|
-
path:
|
|
10437
|
+
path: path42,
|
|
10294
10438
|
score: 0,
|
|
10295
10439
|
weight,
|
|
10296
10440
|
hit: false,
|
|
10297
|
-
message: `${
|
|
10441
|
+
message: `${path42} (invalid numeric value)`
|
|
10298
10442
|
};
|
|
10299
10443
|
}
|
|
10300
10444
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -10307,61 +10451,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
10307
10451
|
}
|
|
10308
10452
|
if (withinTolerance) {
|
|
10309
10453
|
return {
|
|
10310
|
-
path:
|
|
10454
|
+
path: path42,
|
|
10311
10455
|
score: 1,
|
|
10312
10456
|
weight,
|
|
10313
10457
|
hit: true,
|
|
10314
|
-
message: `${
|
|
10458
|
+
message: `${path42} (within tolerance: diff=${diff.toFixed(2)})`
|
|
10315
10459
|
};
|
|
10316
10460
|
}
|
|
10317
10461
|
return {
|
|
10318
|
-
path:
|
|
10462
|
+
path: path42,
|
|
10319
10463
|
score: 0,
|
|
10320
10464
|
weight,
|
|
10321
10465
|
hit: false,
|
|
10322
|
-
message: `${
|
|
10466
|
+
message: `${path42} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
10323
10467
|
};
|
|
10324
10468
|
}
|
|
10325
10469
|
/**
|
|
10326
10470
|
* Date comparison with format normalization.
|
|
10327
10471
|
*/
|
|
10328
|
-
compareDate(
|
|
10472
|
+
compareDate(path42, candidateValue, expectedValue, fieldConfig, weight) {
|
|
10329
10473
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
10330
10474
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
10331
10475
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
10332
10476
|
if (candidateDate === null) {
|
|
10333
10477
|
return {
|
|
10334
|
-
path:
|
|
10478
|
+
path: path42,
|
|
10335
10479
|
score: 0,
|
|
10336
10480
|
weight,
|
|
10337
10481
|
hit: false,
|
|
10338
|
-
message: `${
|
|
10482
|
+
message: `${path42} (unparseable candidate date)`
|
|
10339
10483
|
};
|
|
10340
10484
|
}
|
|
10341
10485
|
if (expectedDate === null) {
|
|
10342
10486
|
return {
|
|
10343
|
-
path:
|
|
10487
|
+
path: path42,
|
|
10344
10488
|
score: 0,
|
|
10345
10489
|
weight,
|
|
10346
10490
|
hit: false,
|
|
10347
|
-
message: `${
|
|
10491
|
+
message: `${path42} (unparseable expected date)`
|
|
10348
10492
|
};
|
|
10349
10493
|
}
|
|
10350
10494
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
10351
10495
|
return {
|
|
10352
|
-
path:
|
|
10496
|
+
path: path42,
|
|
10353
10497
|
score: 1,
|
|
10354
10498
|
weight,
|
|
10355
10499
|
hit: true,
|
|
10356
|
-
message:
|
|
10500
|
+
message: path42
|
|
10357
10501
|
};
|
|
10358
10502
|
}
|
|
10359
10503
|
return {
|
|
10360
|
-
path:
|
|
10504
|
+
path: path42,
|
|
10361
10505
|
score: 0,
|
|
10362
10506
|
weight,
|
|
10363
10507
|
hit: false,
|
|
10364
|
-
message: `${
|
|
10508
|
+
message: `${path42} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
10365
10509
|
};
|
|
10366
10510
|
}
|
|
10367
10511
|
/**
|
|
@@ -10402,11 +10546,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
10402
10546
|
};
|
|
10403
10547
|
}
|
|
10404
10548
|
};
|
|
10405
|
-
function resolvePath(obj,
|
|
10406
|
-
if (!
|
|
10549
|
+
function resolvePath(obj, path42) {
|
|
10550
|
+
if (!path42 || !obj) {
|
|
10407
10551
|
return void 0;
|
|
10408
10552
|
}
|
|
10409
|
-
const parts =
|
|
10553
|
+
const parts = path42.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
10410
10554
|
let current = obj;
|
|
10411
10555
|
for (const part of parts) {
|
|
10412
10556
|
if (current === null || current === void 0) {
|
|
@@ -11224,8 +11368,8 @@ var TokenUsageEvaluator = class {
|
|
|
11224
11368
|
};
|
|
11225
11369
|
|
|
11226
11370
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
11227
|
-
function getNestedValue(obj,
|
|
11228
|
-
const parts =
|
|
11371
|
+
function getNestedValue(obj, path42) {
|
|
11372
|
+
const parts = path42.split(".");
|
|
11229
11373
|
let current = obj;
|
|
11230
11374
|
for (const part of parts) {
|
|
11231
11375
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -11786,9 +11930,9 @@ function runEqualsAssertion(output, value) {
|
|
|
11786
11930
|
}
|
|
11787
11931
|
|
|
11788
11932
|
// src/evaluation/orchestrator.ts
|
|
11789
|
-
import { createHash as
|
|
11790
|
-
import { mkdir as
|
|
11791
|
-
import
|
|
11933
|
+
import { createHash as createHash3, randomUUID as randomUUID7 } from "node:crypto";
|
|
11934
|
+
import { mkdir as mkdir13, stat as stat7 } from "node:fs/promises";
|
|
11935
|
+
import path39 from "node:path";
|
|
11792
11936
|
import micromatch4 from "micromatch";
|
|
11793
11937
|
|
|
11794
11938
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -12658,17 +12802,283 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
12658
12802
|
}
|
|
12659
12803
|
}
|
|
12660
12804
|
|
|
12661
|
-
// src/evaluation/workspace/
|
|
12805
|
+
// src/evaluation/workspace/pool-manager.ts
|
|
12662
12806
|
import { execFile } from "node:child_process";
|
|
12663
12807
|
import { createHash } from "node:crypto";
|
|
12664
12808
|
import { existsSync as existsSync2 } from "node:fs";
|
|
12665
|
-
import { mkdir as mkdir11, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
|
|
12809
|
+
import { cp as cp2, mkdir as mkdir11, readFile as readFile11, readdir as readdir4, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
|
|
12666
12810
|
import path36 from "node:path";
|
|
12667
12811
|
import { promisify as promisify5 } from "node:util";
|
|
12668
12812
|
var execFileAsync = promisify5(execFile);
|
|
12813
|
+
function gitEnv() {
|
|
12814
|
+
const env = { ...process.env };
|
|
12815
|
+
for (const key of Object.keys(env)) {
|
|
12816
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
12817
|
+
delete env[key];
|
|
12818
|
+
}
|
|
12819
|
+
}
|
|
12820
|
+
return {
|
|
12821
|
+
...env,
|
|
12822
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
12823
|
+
GIT_ASKPASS: "",
|
|
12824
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
12825
|
+
};
|
|
12826
|
+
}
|
|
12827
|
+
async function git(args, opts) {
|
|
12828
|
+
const { stdout } = await execFileAsync("git", args, {
|
|
12829
|
+
cwd: opts?.cwd,
|
|
12830
|
+
timeout: opts?.timeout ?? 3e5,
|
|
12831
|
+
env: gitEnv(),
|
|
12832
|
+
maxBuffer: 50 * 1024 * 1024
|
|
12833
|
+
});
|
|
12834
|
+
return stdout.trim();
|
|
12835
|
+
}
|
|
12836
|
+
function normalizeRepoForFingerprint(repo) {
|
|
12837
|
+
const source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
|
|
12838
|
+
const result = {
|
|
12839
|
+
path: repo.path,
|
|
12840
|
+
source,
|
|
12841
|
+
ref: repo.checkout?.ref ?? "HEAD"
|
|
12842
|
+
};
|
|
12843
|
+
if (repo.clone?.depth !== void 0) {
|
|
12844
|
+
result.depth = repo.clone.depth;
|
|
12845
|
+
}
|
|
12846
|
+
if (repo.clone?.filter !== void 0) {
|
|
12847
|
+
result.filter = repo.clone.filter;
|
|
12848
|
+
}
|
|
12849
|
+
if (repo.clone?.sparse?.length) {
|
|
12850
|
+
result.sparse = [...repo.clone.sparse].sort();
|
|
12851
|
+
}
|
|
12852
|
+
return result;
|
|
12853
|
+
}
|
|
12854
|
+
function computeWorkspaceFingerprint(templatePath, repos) {
|
|
12855
|
+
const canonical = {
|
|
12856
|
+
templatePath: templatePath ?? null,
|
|
12857
|
+
repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint)
|
|
12858
|
+
};
|
|
12859
|
+
return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
|
|
12860
|
+
}
|
|
12861
|
+
async function copyDirectoryRecursive2(src, dest, skipDirs) {
|
|
12862
|
+
await mkdir11(dest, { recursive: true });
|
|
12863
|
+
const entries = await readdir4(src, { withFileTypes: true });
|
|
12864
|
+
for (const entry of entries) {
|
|
12865
|
+
const srcPath = path36.join(src, entry.name);
|
|
12866
|
+
const destPath = path36.join(dest, entry.name);
|
|
12867
|
+
if (entry.name === ".git") {
|
|
12868
|
+
continue;
|
|
12869
|
+
}
|
|
12870
|
+
if (entry.isDirectory()) {
|
|
12871
|
+
if (skipDirs?.has(entry.name)) {
|
|
12872
|
+
continue;
|
|
12873
|
+
}
|
|
12874
|
+
await copyDirectoryRecursive2(srcPath, destPath, skipDirs);
|
|
12875
|
+
} else {
|
|
12876
|
+
await cp2(srcPath, destPath, { preserveTimestamps: true, force: true });
|
|
12877
|
+
}
|
|
12878
|
+
}
|
|
12879
|
+
}
|
|
12880
|
+
var WorkspacePoolManager = class {
|
|
12881
|
+
poolRoot;
|
|
12882
|
+
constructor(poolRoot) {
|
|
12883
|
+
this.poolRoot = poolRoot ?? getWorkspacePoolRoot();
|
|
12884
|
+
}
|
|
12885
|
+
/**
|
|
12886
|
+
* Acquire a workspace slot from the pool.
|
|
12887
|
+
*
|
|
12888
|
+
* 1. Compute fingerprint from template + repos
|
|
12889
|
+
* 2. Check drift (compare stored metadata.json fingerprint vs computed)
|
|
12890
|
+
* 3. If drift: warn, remove all slots, rematerialize
|
|
12891
|
+
* 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
|
|
12892
|
+
* 5. If slot exists: reset repos, re-copy template files (skip repo directories)
|
|
12893
|
+
* 6. If new slot: copy template, materialize all repos, write metadata.json
|
|
12894
|
+
* 7. Return the slot (with path, index, isExisting)
|
|
12895
|
+
*/
|
|
12896
|
+
async acquireWorkspace(options) {
|
|
12897
|
+
const { templatePath, repos, maxSlots, repoManager } = options;
|
|
12898
|
+
const fingerprint = computeWorkspaceFingerprint(templatePath, repos);
|
|
12899
|
+
const poolDir = path36.join(this.poolRoot, fingerprint);
|
|
12900
|
+
await mkdir11(poolDir, { recursive: true });
|
|
12901
|
+
const drifted = await this.checkDrift(poolDir, fingerprint);
|
|
12902
|
+
if (drifted) {
|
|
12903
|
+
console.warn(
|
|
12904
|
+
`[workspace-pool] Drift detected for fingerprint ${fingerprint.slice(0, 12)}... Removing stale slots.`
|
|
12905
|
+
);
|
|
12906
|
+
await this.removeAllSlots(poolDir);
|
|
12907
|
+
}
|
|
12908
|
+
for (let i = 0; i < maxSlots; i++) {
|
|
12909
|
+
const slotPath = path36.join(poolDir, `slot-${i}`);
|
|
12910
|
+
const lockPath = `${slotPath}.lock`;
|
|
12911
|
+
const locked = await this.tryLock(lockPath);
|
|
12912
|
+
if (!locked) {
|
|
12913
|
+
continue;
|
|
12914
|
+
}
|
|
12915
|
+
const slotExists = existsSync2(slotPath);
|
|
12916
|
+
if (slotExists) {
|
|
12917
|
+
await this.resetSlot(slotPath, templatePath, repos);
|
|
12918
|
+
return {
|
|
12919
|
+
index: i,
|
|
12920
|
+
path: slotPath,
|
|
12921
|
+
isExisting: true,
|
|
12922
|
+
lockPath,
|
|
12923
|
+
fingerprint,
|
|
12924
|
+
poolDir
|
|
12925
|
+
};
|
|
12926
|
+
}
|
|
12927
|
+
await mkdir11(slotPath, { recursive: true });
|
|
12928
|
+
if (templatePath) {
|
|
12929
|
+
await copyDirectoryRecursive2(templatePath, slotPath);
|
|
12930
|
+
}
|
|
12931
|
+
if (repos.length > 0) {
|
|
12932
|
+
await repoManager.materializeAll(repos, slotPath);
|
|
12933
|
+
}
|
|
12934
|
+
await this.writeMetadata(poolDir, fingerprint, templatePath ?? null, repos);
|
|
12935
|
+
return {
|
|
12936
|
+
index: i,
|
|
12937
|
+
path: slotPath,
|
|
12938
|
+
isExisting: false,
|
|
12939
|
+
lockPath,
|
|
12940
|
+
fingerprint,
|
|
12941
|
+
poolDir
|
|
12942
|
+
};
|
|
12943
|
+
}
|
|
12944
|
+
throw new Error(
|
|
12945
|
+
`All ${maxSlots} pool slots are locked for fingerprint ${fingerprint.slice(0, 12)}...`
|
|
12946
|
+
);
|
|
12947
|
+
}
|
|
12948
|
+
/** Remove lock file to release a slot. */
|
|
12949
|
+
async releaseSlot(slot) {
|
|
12950
|
+
try {
|
|
12951
|
+
await unlink(slot.lockPath);
|
|
12952
|
+
} catch {
|
|
12953
|
+
}
|
|
12954
|
+
}
|
|
12955
|
+
/**
|
|
12956
|
+
* Try to acquire a PID-based lock file.
|
|
12957
|
+
* On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
|
|
12958
|
+
* Returns true if lock acquired, false if slot is actively locked.
|
|
12959
|
+
* Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
|
|
12960
|
+
*/
|
|
12961
|
+
async tryLock(lockPath) {
|
|
12962
|
+
for (let attempt = 0; attempt < 3; attempt++) {
|
|
12963
|
+
try {
|
|
12964
|
+
await writeFile7(lockPath, String(process.pid), { flag: "wx" });
|
|
12965
|
+
return true;
|
|
12966
|
+
} catch (err) {
|
|
12967
|
+
if (err.code !== "EEXIST") {
|
|
12968
|
+
throw err;
|
|
12969
|
+
}
|
|
12970
|
+
try {
|
|
12971
|
+
const pidStr = await readFile11(lockPath, "utf-8");
|
|
12972
|
+
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
12973
|
+
if (!Number.isNaN(pid)) {
|
|
12974
|
+
try {
|
|
12975
|
+
process.kill(pid, 0);
|
|
12976
|
+
return false;
|
|
12977
|
+
} catch {
|
|
12978
|
+
await unlink(lockPath).catch(() => {
|
|
12979
|
+
});
|
|
12980
|
+
continue;
|
|
12981
|
+
}
|
|
12982
|
+
}
|
|
12983
|
+
} catch {
|
|
12984
|
+
}
|
|
12985
|
+
return false;
|
|
12986
|
+
}
|
|
12987
|
+
}
|
|
12988
|
+
return false;
|
|
12989
|
+
}
|
|
12990
|
+
/**
|
|
12991
|
+
* Check if the stored fingerprint in metadata.json differs from the computed one.
|
|
12992
|
+
* Returns true if drifted, false otherwise.
|
|
12993
|
+
* Returns false (no drift) if metadata.json doesn't exist (first use).
|
|
12994
|
+
*/
|
|
12995
|
+
async checkDrift(poolDir, fingerprint) {
|
|
12996
|
+
const metadataPath = path36.join(poolDir, "metadata.json");
|
|
12997
|
+
try {
|
|
12998
|
+
const raw = await readFile11(metadataPath, "utf-8");
|
|
12999
|
+
const metadata = JSON.parse(raw);
|
|
13000
|
+
return metadata.fingerprint !== fingerprint;
|
|
13001
|
+
} catch {
|
|
13002
|
+
return false;
|
|
13003
|
+
}
|
|
13004
|
+
}
|
|
13005
|
+
/** Write metadata.json with fingerprint, inputs, and timestamp. */
|
|
13006
|
+
async writeMetadata(poolDir, fingerprint, templatePath, repos) {
|
|
13007
|
+
const metadata = {
|
|
13008
|
+
fingerprint,
|
|
13009
|
+
templatePath,
|
|
13010
|
+
repos,
|
|
13011
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
13012
|
+
};
|
|
13013
|
+
await writeFile7(path36.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
|
|
13014
|
+
}
|
|
13015
|
+
/** Remove all slot directories and their lock files from a pool directory. */
|
|
13016
|
+
async removeAllSlots(poolDir) {
|
|
13017
|
+
const entries = await readdir4(poolDir);
|
|
13018
|
+
for (const entry of entries) {
|
|
13019
|
+
if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
|
|
13020
|
+
const lockPath = path36.join(poolDir, `${entry}.lock`);
|
|
13021
|
+
if (existsSync2(lockPath)) {
|
|
13022
|
+
try {
|
|
13023
|
+
const pidStr = await readFile11(lockPath, "utf-8");
|
|
13024
|
+
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
13025
|
+
if (!Number.isNaN(pid)) {
|
|
13026
|
+
try {
|
|
13027
|
+
process.kill(pid, 0);
|
|
13028
|
+
console.warn(`[workspace-pool] Skipping slot ${entry}: locked by PID ${pid}`);
|
|
13029
|
+
continue;
|
|
13030
|
+
} catch {
|
|
13031
|
+
}
|
|
13032
|
+
}
|
|
13033
|
+
} catch {
|
|
13034
|
+
}
|
|
13035
|
+
}
|
|
13036
|
+
await rm5(path36.join(poolDir, entry), { recursive: true, force: true });
|
|
13037
|
+
await rm5(lockPath, { force: true }).catch(() => {
|
|
13038
|
+
});
|
|
13039
|
+
}
|
|
13040
|
+
}
|
|
13041
|
+
await rm5(path36.join(poolDir, "metadata.json"), { force: true }).catch(() => {
|
|
13042
|
+
});
|
|
13043
|
+
}
|
|
13044
|
+
/**
|
|
13045
|
+
* Reset an existing slot for reuse:
|
|
13046
|
+
* 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
|
|
13047
|
+
* 2. Re-copy template files (skip repo directories)
|
|
13048
|
+
*/
|
|
13049
|
+
async resetSlot(slotPath, templatePath, repos) {
|
|
13050
|
+
for (const repo of repos) {
|
|
13051
|
+
const repoDir = path36.join(slotPath, repo.path);
|
|
13052
|
+
if (!existsSync2(repoDir)) {
|
|
13053
|
+
continue;
|
|
13054
|
+
}
|
|
13055
|
+
const ref = repo.checkout?.ref ?? "HEAD";
|
|
13056
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
13057
|
+
await git(["clean", "-fd"], { cwd: repoDir });
|
|
13058
|
+
}
|
|
13059
|
+
if (templatePath) {
|
|
13060
|
+
const repoDirNames = new Set(
|
|
13061
|
+
repos.map((r) => {
|
|
13062
|
+
const normalized = r.path.replace(/^\.\//, "");
|
|
13063
|
+
return normalized.split("/")[0];
|
|
13064
|
+
})
|
|
13065
|
+
);
|
|
13066
|
+
await copyDirectoryRecursive2(templatePath, slotPath, repoDirNames);
|
|
13067
|
+
}
|
|
13068
|
+
}
|
|
13069
|
+
};
|
|
13070
|
+
|
|
13071
|
+
// src/evaluation/workspace/repo-manager.ts
|
|
13072
|
+
import { execFile as execFile2 } from "node:child_process";
|
|
13073
|
+
import { createHash as createHash2 } from "node:crypto";
|
|
13074
|
+
import { existsSync as existsSync3 } from "node:fs";
|
|
13075
|
+
import { mkdir as mkdir12, rm as rm6, unlink as unlink2, writeFile as writeFile8 } from "node:fs/promises";
|
|
13076
|
+
import path37 from "node:path";
|
|
13077
|
+
import { promisify as promisify6 } from "node:util";
|
|
13078
|
+
var execFileAsync2 = promisify6(execFile2);
|
|
12669
13079
|
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
12670
13080
|
var LOCK_TIMEOUT_MS = 6e4;
|
|
12671
|
-
function
|
|
13081
|
+
function gitEnv2() {
|
|
12672
13082
|
const env = { ...process.env };
|
|
12673
13083
|
for (const key of Object.keys(env)) {
|
|
12674
13084
|
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
@@ -12684,16 +13094,16 @@ function gitEnv() {
|
|
|
12684
13094
|
}
|
|
12685
13095
|
function cacheKey(source) {
|
|
12686
13096
|
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
12687
|
-
return
|
|
13097
|
+
return createHash2("sha256").update(raw).digest("hex");
|
|
12688
13098
|
}
|
|
12689
13099
|
function getSourceUrl(source) {
|
|
12690
13100
|
return source.type === "git" ? source.url : source.path;
|
|
12691
13101
|
}
|
|
12692
|
-
async function
|
|
12693
|
-
const { stdout } = await
|
|
13102
|
+
async function git2(args, opts) {
|
|
13103
|
+
const { stdout } = await execFileAsync2("git", args, {
|
|
12694
13104
|
cwd: opts?.cwd,
|
|
12695
13105
|
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
12696
|
-
env:
|
|
13106
|
+
env: gitEnv2(),
|
|
12697
13107
|
maxBuffer: 50 * 1024 * 1024
|
|
12698
13108
|
// 50MB
|
|
12699
13109
|
});
|
|
@@ -12703,7 +13113,7 @@ async function acquireLock(lockPath) {
|
|
|
12703
13113
|
const start = Date.now();
|
|
12704
13114
|
while (Date.now() - start < LOCK_TIMEOUT_MS) {
|
|
12705
13115
|
try {
|
|
12706
|
-
await
|
|
13116
|
+
await writeFile8(lockPath, String(process.pid), { flag: "wx" });
|
|
12707
13117
|
return;
|
|
12708
13118
|
} catch (err) {
|
|
12709
13119
|
if (err.code === "EEXIST") {
|
|
@@ -12717,7 +13127,7 @@ async function acquireLock(lockPath) {
|
|
|
12717
13127
|
}
|
|
12718
13128
|
async function releaseLock(lockPath) {
|
|
12719
13129
|
try {
|
|
12720
|
-
await
|
|
13130
|
+
await unlink2(lockPath);
|
|
12721
13131
|
} catch {
|
|
12722
13132
|
}
|
|
12723
13133
|
}
|
|
@@ -12731,16 +13141,12 @@ var RepoManager = class {
|
|
|
12731
13141
|
async runGit(args, opts) {
|
|
12732
13142
|
const startedAt = Date.now();
|
|
12733
13143
|
if (this.verbose) {
|
|
12734
|
-
console.log(
|
|
12735
|
-
`[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`
|
|
12736
|
-
);
|
|
13144
|
+
console.log(`[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`);
|
|
12737
13145
|
}
|
|
12738
13146
|
try {
|
|
12739
|
-
const output = await
|
|
13147
|
+
const output = await git2(args, opts);
|
|
12740
13148
|
if (this.verbose) {
|
|
12741
|
-
console.log(
|
|
12742
|
-
`[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`
|
|
12743
|
-
);
|
|
13149
|
+
console.log(`[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`);
|
|
12744
13150
|
}
|
|
12745
13151
|
return output;
|
|
12746
13152
|
} catch (error) {
|
|
@@ -12760,9 +13166,9 @@ var RepoManager = class {
|
|
|
12760
13166
|
*/
|
|
12761
13167
|
async ensureCache(source, depth, resolve) {
|
|
12762
13168
|
const key = cacheKey(source);
|
|
12763
|
-
const cachePath =
|
|
13169
|
+
const cachePath = path37.join(this.cacheDir, key);
|
|
12764
13170
|
const lockPath = `${cachePath}.lock`;
|
|
12765
|
-
const cacheExists =
|
|
13171
|
+
const cacheExists = existsSync3(path37.join(cachePath, "HEAD"));
|
|
12766
13172
|
if (this.verbose) {
|
|
12767
13173
|
console.log(
|
|
12768
13174
|
`[repo] ensureCache source=${getSourceUrl(source)} resolve=${resolve ?? "remote"} cache=${cachePath}`
|
|
@@ -12780,13 +13186,11 @@ var RepoManager = class {
|
|
|
12780
13186
|
`No cache found for \`${url}\`. Run \`agentv cache add --url ${url} --from <local-path>\` to seed it.`
|
|
12781
13187
|
);
|
|
12782
13188
|
}
|
|
12783
|
-
await
|
|
13189
|
+
await mkdir12(this.cacheDir, { recursive: true });
|
|
12784
13190
|
const lockStartedAt = Date.now();
|
|
12785
13191
|
await acquireLock(lockPath);
|
|
12786
13192
|
if (this.verbose) {
|
|
12787
|
-
console.log(
|
|
12788
|
-
`[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`
|
|
12789
|
-
);
|
|
13193
|
+
console.log(`[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`);
|
|
12790
13194
|
}
|
|
12791
13195
|
try {
|
|
12792
13196
|
if (cacheExists) {
|
|
@@ -12824,7 +13228,7 @@ var RepoManager = class {
|
|
|
12824
13228
|
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
12825
13229
|
*/
|
|
12826
13230
|
async materialize(repo, workspacePath) {
|
|
12827
|
-
const targetDir =
|
|
13231
|
+
const targetDir = path37.join(workspacePath, repo.path);
|
|
12828
13232
|
const startedAt = Date.now();
|
|
12829
13233
|
if (this.verbose) {
|
|
12830
13234
|
console.log(
|
|
@@ -12919,14 +13323,14 @@ var RepoManager = class {
|
|
|
12919
13323
|
async reset(repos, workspacePath, strategy) {
|
|
12920
13324
|
if (strategy === "recreate") {
|
|
12921
13325
|
for (const repo of repos) {
|
|
12922
|
-
const targetDir =
|
|
12923
|
-
await
|
|
13326
|
+
const targetDir = path37.join(workspacePath, repo.path);
|
|
13327
|
+
await rm6(targetDir, { recursive: true, force: true });
|
|
12924
13328
|
}
|
|
12925
13329
|
await this.materializeAll(repos, workspacePath);
|
|
12926
13330
|
return;
|
|
12927
13331
|
}
|
|
12928
13332
|
for (const repo of repos) {
|
|
12929
|
-
const targetDir =
|
|
13333
|
+
const targetDir = path37.join(workspacePath, repo.path);
|
|
12930
13334
|
await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
|
|
12931
13335
|
await this.runGit(["clean", "-fd"], { cwd: targetDir });
|
|
12932
13336
|
}
|
|
@@ -12938,21 +13342,21 @@ var RepoManager = class {
|
|
|
12938
13342
|
async seedCache(localPath, remoteUrl, opts) {
|
|
12939
13343
|
const source = { type: "git", url: remoteUrl };
|
|
12940
13344
|
const key = cacheKey(source);
|
|
12941
|
-
const cachePath =
|
|
13345
|
+
const cachePath = path37.join(this.cacheDir, key);
|
|
12942
13346
|
const lockPath = `${cachePath}.lock`;
|
|
12943
|
-
await
|
|
13347
|
+
await mkdir12(this.cacheDir, { recursive: true });
|
|
12944
13348
|
await acquireLock(lockPath);
|
|
12945
13349
|
try {
|
|
12946
|
-
if (
|
|
13350
|
+
if (existsSync3(path37.join(cachePath, "HEAD"))) {
|
|
12947
13351
|
if (!opts?.force) {
|
|
12948
13352
|
throw new Error(
|
|
12949
13353
|
`Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
|
|
12950
13354
|
);
|
|
12951
13355
|
}
|
|
12952
|
-
await
|
|
13356
|
+
await rm6(cachePath, { recursive: true, force: true });
|
|
12953
13357
|
}
|
|
12954
|
-
await
|
|
12955
|
-
await
|
|
13358
|
+
await git2(["clone", "--mirror", "--bare", localPath, cachePath]);
|
|
13359
|
+
await git2(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
|
|
12956
13360
|
} finally {
|
|
12957
13361
|
await releaseLock(lockPath);
|
|
12958
13362
|
}
|
|
@@ -12960,41 +13364,41 @@ var RepoManager = class {
|
|
|
12960
13364
|
}
|
|
12961
13365
|
/** Remove the entire cache directory. */
|
|
12962
13366
|
async cleanCache() {
|
|
12963
|
-
await
|
|
13367
|
+
await rm6(this.cacheDir, { recursive: true, force: true });
|
|
12964
13368
|
}
|
|
12965
13369
|
};
|
|
12966
13370
|
|
|
12967
13371
|
// src/evaluation/workspace/resolve.ts
|
|
12968
|
-
import { readdir as
|
|
12969
|
-
import
|
|
13372
|
+
import { readdir as readdir5, stat as stat6 } from "node:fs/promises";
|
|
13373
|
+
import path38 from "node:path";
|
|
12970
13374
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
12971
13375
|
if (!templatePath) {
|
|
12972
13376
|
return void 0;
|
|
12973
13377
|
}
|
|
12974
|
-
const resolved =
|
|
13378
|
+
const resolved = path38.resolve(templatePath);
|
|
12975
13379
|
const stats = await stat6(resolved);
|
|
12976
13380
|
if (stats.isFile()) {
|
|
12977
13381
|
return {
|
|
12978
|
-
dir:
|
|
13382
|
+
dir: path38.dirname(resolved),
|
|
12979
13383
|
workspaceFile: resolved
|
|
12980
13384
|
};
|
|
12981
13385
|
}
|
|
12982
13386
|
if (!stats.isDirectory()) {
|
|
12983
13387
|
throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
|
|
12984
13388
|
}
|
|
12985
|
-
const entries = await
|
|
13389
|
+
const entries = await readdir5(resolved);
|
|
12986
13390
|
const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
|
|
12987
13391
|
if (workspaceFiles.length === 1) {
|
|
12988
13392
|
return {
|
|
12989
13393
|
dir: resolved,
|
|
12990
|
-
workspaceFile:
|
|
13394
|
+
workspaceFile: path38.join(resolved, workspaceFiles[0])
|
|
12991
13395
|
};
|
|
12992
13396
|
}
|
|
12993
13397
|
if (workspaceFiles.length > 1) {
|
|
12994
13398
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
12995
13399
|
return {
|
|
12996
13400
|
dir: resolved,
|
|
12997
|
-
workspaceFile: conventionFile ?
|
|
13401
|
+
workspaceFile: conventionFile ? path38.join(resolved, conventionFile) : void 0
|
|
12998
13402
|
};
|
|
12999
13403
|
}
|
|
13000
13404
|
return { dir: resolved };
|
|
@@ -13076,7 +13480,10 @@ async function runEvaluation(options) {
|
|
|
13076
13480
|
trials,
|
|
13077
13481
|
streamCallbacks,
|
|
13078
13482
|
totalBudgetUsd,
|
|
13079
|
-
failOnError
|
|
13483
|
+
failOnError,
|
|
13484
|
+
poolWorkspaces,
|
|
13485
|
+
poolMaxSlots: configPoolMaxSlots,
|
|
13486
|
+
workspace: userWorkspacePath
|
|
13080
13487
|
} = options;
|
|
13081
13488
|
let useCache = options.useCache;
|
|
13082
13489
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -13150,7 +13557,7 @@ async function runEvaluation(options) {
|
|
|
13150
13557
|
];
|
|
13151
13558
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
13152
13559
|
const typeRegistry = createBuiltinRegistry();
|
|
13153
|
-
const discoveryBaseDir = evalFilePath ?
|
|
13560
|
+
const discoveryBaseDir = evalFilePath ? path39.dirname(path39.resolve(evalFilePath)) : process.cwd();
|
|
13154
13561
|
const evalDir = discoveryBaseDir;
|
|
13155
13562
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
13156
13563
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
@@ -13212,13 +13619,19 @@ async function runEvaluation(options) {
|
|
|
13212
13619
|
}
|
|
13213
13620
|
};
|
|
13214
13621
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
13215
|
-
|
|
13622
|
+
if (userWorkspacePath && isPerTestIsolation) {
|
|
13623
|
+
throw new Error(
|
|
13624
|
+
"--workspace is incompatible with isolation: per_test. Use isolation: shared (default)."
|
|
13625
|
+
);
|
|
13626
|
+
}
|
|
13627
|
+
const hasSharedWorkspace = !!(userWorkspacePath || workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
13628
|
+
const usePool = poolWorkspaces === true && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !userWorkspacePath;
|
|
13216
13629
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
13217
|
-
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
13630
|
+
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
13218
13631
|
setupLog(
|
|
13219
|
-
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
|
|
13632
|
+
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
|
|
13220
13633
|
);
|
|
13221
|
-
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
13634
|
+
if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
|
|
13222
13635
|
console.warn(
|
|
13223
13636
|
`Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
|
|
13224
13637
|
);
|
|
@@ -13227,7 +13640,37 @@ async function runEvaluation(options) {
|
|
|
13227
13640
|
let sharedWorkspacePath;
|
|
13228
13641
|
let sharedBaselineCommit;
|
|
13229
13642
|
let beforeAllOutput;
|
|
13230
|
-
|
|
13643
|
+
let poolManager;
|
|
13644
|
+
let poolSlot;
|
|
13645
|
+
const poolSlots = [];
|
|
13646
|
+
const availablePoolSlots = [];
|
|
13647
|
+
const poolSlotBaselines = /* @__PURE__ */ new Map();
|
|
13648
|
+
const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
|
|
13649
|
+
if (userWorkspacePath) {
|
|
13650
|
+
sharedWorkspacePath = userWorkspacePath;
|
|
13651
|
+
setupLog(`using user-provided workspace: ${userWorkspacePath}`);
|
|
13652
|
+
} else if (usePool && suiteWorkspace?.repos) {
|
|
13653
|
+
const slotsNeeded = workers;
|
|
13654
|
+
setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
|
|
13655
|
+
poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
|
|
13656
|
+
const poolRepoManager = new RepoManager(void 0, verbose);
|
|
13657
|
+
for (let i = 0; i < slotsNeeded; i++) {
|
|
13658
|
+
const slot = await poolManager.acquireWorkspace({
|
|
13659
|
+
templatePath: workspaceTemplate,
|
|
13660
|
+
repos: suiteWorkspace.repos,
|
|
13661
|
+
maxSlots: poolMaxSlots,
|
|
13662
|
+
repoManager: poolRepoManager
|
|
13663
|
+
});
|
|
13664
|
+
poolSlots.push(slot);
|
|
13665
|
+
setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
|
|
13666
|
+
}
|
|
13667
|
+
if (slotsNeeded === 1) {
|
|
13668
|
+
poolSlot = poolSlots[0];
|
|
13669
|
+
sharedWorkspacePath = poolSlot.path;
|
|
13670
|
+
} else {
|
|
13671
|
+
availablePoolSlots.push(...poolSlots);
|
|
13672
|
+
}
|
|
13673
|
+
} else if (workspaceTemplate) {
|
|
13231
13674
|
setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
|
|
13232
13675
|
try {
|
|
13233
13676
|
sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
|
|
@@ -13236,288 +13679,344 @@ async function runEvaluation(options) {
|
|
|
13236
13679
|
const message = error instanceof Error ? error.message : String(error);
|
|
13237
13680
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
13238
13681
|
}
|
|
13682
|
+
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
13683
|
+
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
13684
|
+
await mkdir13(sharedWorkspacePath, { recursive: true });
|
|
13685
|
+
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
13686
|
+
}
|
|
13687
|
+
try {
|
|
13239
13688
|
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
13240
|
-
const copiedWorkspaceFile =
|
|
13689
|
+
const copiedWorkspaceFile = path39.join(sharedWorkspacePath, path39.basename(suiteWorkspaceFile));
|
|
13241
13690
|
try {
|
|
13242
13691
|
await stat7(copiedWorkspaceFile);
|
|
13243
13692
|
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
13244
13693
|
} catch {
|
|
13245
13694
|
}
|
|
13246
13695
|
}
|
|
13247
|
-
|
|
13248
|
-
sharedWorkspacePath
|
|
13249
|
-
|
|
13250
|
-
|
|
13251
|
-
|
|
13252
|
-
|
|
13253
|
-
|
|
13254
|
-
|
|
13255
|
-
|
|
13256
|
-
|
|
13257
|
-
|
|
13258
|
-
|
|
13259
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
13260
|
-
if (sharedWorkspacePath) {
|
|
13261
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13262
|
-
});
|
|
13263
|
-
}
|
|
13264
|
-
throw new Error(`Failed to materialize repos: ${message}`);
|
|
13265
|
-
}
|
|
13266
|
-
}
|
|
13267
|
-
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
13268
|
-
const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
|
|
13269
|
-
setupLog(
|
|
13270
|
-
`running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
13271
|
-
);
|
|
13272
|
-
const scriptContext = {
|
|
13273
|
-
workspacePath: sharedWorkspacePath,
|
|
13274
|
-
testId: "__before_all__",
|
|
13275
|
-
evalRunId,
|
|
13276
|
-
evalDir
|
|
13277
|
-
};
|
|
13278
|
-
try {
|
|
13279
|
-
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
13280
|
-
setupLog("shared before_all completed");
|
|
13281
|
-
} catch (error) {
|
|
13282
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
13283
|
-
if (sharedWorkspacePath) {
|
|
13284
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13285
|
-
});
|
|
13286
|
-
}
|
|
13287
|
-
throw new Error(`before_all script failed: ${message}`);
|
|
13288
|
-
}
|
|
13289
|
-
}
|
|
13290
|
-
if (sharedWorkspacePath) {
|
|
13291
|
-
try {
|
|
13292
|
-
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
13293
|
-
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
13294
|
-
} catch {
|
|
13295
|
-
setupLog("shared baseline initialization skipped (non-fatal)");
|
|
13296
|
-
}
|
|
13297
|
-
}
|
|
13298
|
-
let nextWorkerId = 1;
|
|
13299
|
-
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
13300
|
-
let beforeAllOutputAttached = false;
|
|
13301
|
-
let cumulativeBudgetCost = 0;
|
|
13302
|
-
let budgetExhausted = false;
|
|
13303
|
-
let failOnErrorTriggered = false;
|
|
13304
|
-
const promises = filteredEvalCases.map(
|
|
13305
|
-
(evalCase) => limit(async () => {
|
|
13306
|
-
const workerId = nextWorkerId++;
|
|
13307
|
-
workerIdByEvalId.set(evalCase.id, workerId);
|
|
13308
|
-
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
13309
|
-
const budgetResult = {
|
|
13310
|
-
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13311
|
-
testId: evalCase.id,
|
|
13312
|
-
dataset: evalCase.dataset,
|
|
13313
|
-
score: 0,
|
|
13314
|
-
hits: [],
|
|
13315
|
-
misses: [],
|
|
13316
|
-
answer: "",
|
|
13317
|
-
target: target.name,
|
|
13318
|
-
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13319
|
-
budgetExceeded: true,
|
|
13320
|
-
executionStatus: "execution_error",
|
|
13321
|
-
failureStage: "setup",
|
|
13322
|
-
failureReasonCode: "budget_exceeded",
|
|
13323
|
-
executionError: {
|
|
13324
|
-
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13325
|
-
stage: "setup"
|
|
13326
|
-
}
|
|
13327
|
-
};
|
|
13328
|
-
if (onProgress) {
|
|
13329
|
-
await onProgress({
|
|
13330
|
-
workerId,
|
|
13331
|
-
testId: evalCase.id,
|
|
13332
|
-
status: "failed",
|
|
13333
|
-
completedAt: Date.now(),
|
|
13334
|
-
error: budgetResult.error
|
|
13696
|
+
const repoManager = suiteWorkspace?.repos?.length && !usePool && !userWorkspacePath ? new RepoManager(void 0, verbose) : void 0;
|
|
13697
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
13698
|
+
setupLog(
|
|
13699
|
+
`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
|
|
13700
|
+
);
|
|
13701
|
+
try {
|
|
13702
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
13703
|
+
setupLog("shared repo materialization complete");
|
|
13704
|
+
} catch (error) {
|
|
13705
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13706
|
+
if (sharedWorkspacePath && !userWorkspacePath) {
|
|
13707
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13335
13708
|
});
|
|
13336
13709
|
}
|
|
13337
|
-
|
|
13338
|
-
await onResult(budgetResult);
|
|
13339
|
-
}
|
|
13340
|
-
return budgetResult;
|
|
13710
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
13341
13711
|
}
|
|
13342
|
-
|
|
13343
|
-
|
|
13344
|
-
|
|
13345
|
-
|
|
13346
|
-
|
|
13347
|
-
|
|
13348
|
-
|
|
13349
|
-
|
|
13350
|
-
|
|
13351
|
-
|
|
13352
|
-
|
|
13353
|
-
|
|
13354
|
-
|
|
13355
|
-
|
|
13356
|
-
|
|
13357
|
-
|
|
13358
|
-
|
|
13359
|
-
if (
|
|
13360
|
-
await
|
|
13361
|
-
workerId,
|
|
13362
|
-
testId: evalCase.id,
|
|
13363
|
-
status: "failed",
|
|
13364
|
-
completedAt: Date.now(),
|
|
13365
|
-
error: haltResult.error
|
|
13712
|
+
}
|
|
13713
|
+
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
13714
|
+
const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
|
|
13715
|
+
setupLog(
|
|
13716
|
+
`running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
13717
|
+
);
|
|
13718
|
+
const scriptContext = {
|
|
13719
|
+
workspacePath: sharedWorkspacePath,
|
|
13720
|
+
testId: "__before_all__",
|
|
13721
|
+
evalRunId,
|
|
13722
|
+
evalDir
|
|
13723
|
+
};
|
|
13724
|
+
try {
|
|
13725
|
+
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
13726
|
+
setupLog("shared before_all completed");
|
|
13727
|
+
} catch (error) {
|
|
13728
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13729
|
+
if (sharedWorkspacePath && !userWorkspacePath) {
|
|
13730
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13366
13731
|
});
|
|
13367
13732
|
}
|
|
13368
|
-
|
|
13369
|
-
await onResult(haltResult);
|
|
13370
|
-
}
|
|
13371
|
-
return haltResult;
|
|
13372
|
-
}
|
|
13373
|
-
if (onProgress) {
|
|
13374
|
-
await onProgress({
|
|
13375
|
-
workerId,
|
|
13376
|
-
testId: evalCase.id,
|
|
13377
|
-
status: "running",
|
|
13378
|
-
startedAt: Date.now()
|
|
13379
|
-
});
|
|
13733
|
+
throw new Error(`before_all script failed: ${message}`);
|
|
13380
13734
|
}
|
|
13381
|
-
|
|
13382
|
-
|
|
13383
|
-
|
|
13384
|
-
|
|
13385
|
-
|
|
13386
|
-
|
|
13387
|
-
|
|
13388
|
-
maxRetries,
|
|
13389
|
-
agentTimeoutMs,
|
|
13390
|
-
cache,
|
|
13391
|
-
useCache,
|
|
13392
|
-
now,
|
|
13393
|
-
judgeProvider,
|
|
13394
|
-
targetResolver,
|
|
13395
|
-
availableTargets,
|
|
13735
|
+
}
|
|
13736
|
+
if (availablePoolSlots.length > 0 && suiteWorkspace?.before_all) {
|
|
13737
|
+
for (const slot of availablePoolSlots) {
|
|
13738
|
+
setupLog(`running before_all on pool slot ${slot.index}`);
|
|
13739
|
+
const scriptContext = {
|
|
13740
|
+
workspacePath: slot.path,
|
|
13741
|
+
testId: "__before_all__",
|
|
13396
13742
|
evalRunId,
|
|
13397
|
-
keepWorkspaces,
|
|
13398
|
-
cleanupWorkspaces,
|
|
13399
|
-
sharedWorkspacePath,
|
|
13400
|
-
sharedBaselineCommit,
|
|
13401
|
-
suiteWorkspaceFile,
|
|
13402
|
-
streamCallbacks,
|
|
13403
|
-
typeRegistry,
|
|
13404
|
-
repoManager,
|
|
13405
13743
|
evalDir
|
|
13406
13744
|
};
|
|
13407
|
-
|
|
13408
|
-
|
|
13409
|
-
|
|
13410
|
-
|
|
13411
|
-
|
|
13412
|
-
|
|
13413
|
-
|
|
13745
|
+
try {
|
|
13746
|
+
const output = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
13747
|
+
if (!beforeAllOutput) beforeAllOutput = output;
|
|
13748
|
+
setupLog(`before_all completed on pool slot ${slot.index}`);
|
|
13749
|
+
} catch (error) {
|
|
13750
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
13751
|
+
throw new Error(`before_all script failed on pool slot ${slot.index}: ${message}`);
|
|
13752
|
+
}
|
|
13753
|
+
}
|
|
13754
|
+
}
|
|
13755
|
+
if (sharedWorkspacePath) {
|
|
13756
|
+
try {
|
|
13757
|
+
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
13758
|
+
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
13759
|
+
} catch {
|
|
13760
|
+
setupLog("shared baseline initialization skipped (non-fatal)");
|
|
13761
|
+
}
|
|
13762
|
+
}
|
|
13763
|
+
if (availablePoolSlots.length > 0) {
|
|
13764
|
+
for (const slot of availablePoolSlots) {
|
|
13765
|
+
try {
|
|
13766
|
+
const baseline = await initializeBaseline(slot.path);
|
|
13767
|
+
poolSlotBaselines.set(slot.path, baseline);
|
|
13768
|
+
setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
|
|
13769
|
+
} catch {
|
|
13770
|
+
setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`);
|
|
13771
|
+
}
|
|
13772
|
+
}
|
|
13773
|
+
}
|
|
13774
|
+
let nextWorkerId = 1;
|
|
13775
|
+
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
13776
|
+
let beforeAllOutputAttached = false;
|
|
13777
|
+
let cumulativeBudgetCost = 0;
|
|
13778
|
+
let budgetExhausted = false;
|
|
13779
|
+
let failOnErrorTriggered = false;
|
|
13780
|
+
const promises = filteredEvalCases.map(
|
|
13781
|
+
(evalCase) => limit(async () => {
|
|
13782
|
+
const workerId = nextWorkerId++;
|
|
13783
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
13784
|
+
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
13785
|
+
const budgetResult = {
|
|
13786
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13787
|
+
testId: evalCase.id,
|
|
13788
|
+
dataset: evalCase.dataset,
|
|
13789
|
+
score: 0,
|
|
13790
|
+
hits: [],
|
|
13791
|
+
misses: [],
|
|
13792
|
+
answer: "",
|
|
13793
|
+
target: target.name,
|
|
13794
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13795
|
+
budgetExceeded: true,
|
|
13796
|
+
executionStatus: "execution_error",
|
|
13797
|
+
failureStage: "setup",
|
|
13798
|
+
failureReasonCode: "budget_exceeded",
|
|
13799
|
+
executionError: {
|
|
13800
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13801
|
+
stage: "setup"
|
|
13414
13802
|
}
|
|
13415
|
-
}
|
|
13416
|
-
|
|
13803
|
+
};
|
|
13804
|
+
if (onProgress) {
|
|
13805
|
+
await onProgress({
|
|
13806
|
+
workerId,
|
|
13807
|
+
testId: evalCase.id,
|
|
13808
|
+
status: "failed",
|
|
13809
|
+
completedAt: Date.now(),
|
|
13810
|
+
error: budgetResult.error
|
|
13811
|
+
});
|
|
13417
13812
|
}
|
|
13418
|
-
if (
|
|
13419
|
-
|
|
13420
|
-
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
13421
|
-
budgetExhausted = true;
|
|
13422
|
-
}
|
|
13813
|
+
if (onResult) {
|
|
13814
|
+
await onResult(budgetResult);
|
|
13423
13815
|
}
|
|
13816
|
+
return budgetResult;
|
|
13424
13817
|
}
|
|
13425
|
-
if (failOnError === true &&
|
|
13426
|
-
|
|
13427
|
-
|
|
13428
|
-
|
|
13429
|
-
|
|
13430
|
-
|
|
13818
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
13819
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
13820
|
+
const haltResult = {
|
|
13821
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13822
|
+
testId: evalCase.id,
|
|
13823
|
+
dataset: evalCase.dataset,
|
|
13824
|
+
score: 0,
|
|
13825
|
+
hits: [],
|
|
13826
|
+
misses: [],
|
|
13827
|
+
answer: "",
|
|
13828
|
+
target: target.name,
|
|
13829
|
+
error: errorMsg,
|
|
13830
|
+
executionStatus: "execution_error",
|
|
13831
|
+
failureStage: "setup",
|
|
13832
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
13833
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
13834
|
+
};
|
|
13835
|
+
if (onProgress) {
|
|
13836
|
+
await onProgress({
|
|
13837
|
+
workerId,
|
|
13838
|
+
testId: evalCase.id,
|
|
13839
|
+
status: "failed",
|
|
13840
|
+
completedAt: Date.now(),
|
|
13841
|
+
error: haltResult.error
|
|
13842
|
+
});
|
|
13843
|
+
}
|
|
13844
|
+
if (onResult) {
|
|
13845
|
+
await onResult(haltResult);
|
|
13846
|
+
}
|
|
13847
|
+
return haltResult;
|
|
13431
13848
|
}
|
|
13432
13849
|
if (onProgress) {
|
|
13433
13850
|
await onProgress({
|
|
13434
13851
|
workerId,
|
|
13435
13852
|
testId: evalCase.id,
|
|
13436
|
-
status:
|
|
13437
|
-
startedAt:
|
|
13438
|
-
// Not used for completed status
|
|
13439
|
-
completedAt: Date.now(),
|
|
13440
|
-
error: result.error
|
|
13853
|
+
status: "running",
|
|
13854
|
+
startedAt: Date.now()
|
|
13441
13855
|
});
|
|
13442
13856
|
}
|
|
13443
|
-
|
|
13444
|
-
|
|
13857
|
+
const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : void 0;
|
|
13858
|
+
const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath;
|
|
13859
|
+
const testBaselineCommit = testPoolSlot ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit;
|
|
13860
|
+
try {
|
|
13861
|
+
const judgeProvider = await resolveJudgeProvider(target);
|
|
13862
|
+
const runCaseOptions = {
|
|
13863
|
+
evalCase,
|
|
13864
|
+
provider: primaryProvider,
|
|
13865
|
+
target,
|
|
13866
|
+
evaluators: evaluatorRegistry,
|
|
13867
|
+
maxRetries,
|
|
13868
|
+
agentTimeoutMs,
|
|
13869
|
+
cache,
|
|
13870
|
+
useCache,
|
|
13871
|
+
now,
|
|
13872
|
+
judgeProvider,
|
|
13873
|
+
targetResolver,
|
|
13874
|
+
availableTargets,
|
|
13875
|
+
evalRunId,
|
|
13876
|
+
keepWorkspaces,
|
|
13877
|
+
cleanupWorkspaces,
|
|
13878
|
+
sharedWorkspacePath: testWorkspacePath,
|
|
13879
|
+
sharedBaselineCommit: testBaselineCommit,
|
|
13880
|
+
suiteWorkspaceFile,
|
|
13881
|
+
streamCallbacks,
|
|
13882
|
+
typeRegistry,
|
|
13883
|
+
repoManager,
|
|
13884
|
+
evalDir
|
|
13885
|
+
};
|
|
13886
|
+
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
13887
|
+
if (totalBudgetUsd !== void 0) {
|
|
13888
|
+
let caseCost;
|
|
13889
|
+
if (result.trials && result.trials.length > 0) {
|
|
13890
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
13891
|
+
if (trialCostSum > 0) {
|
|
13892
|
+
caseCost = trialCostSum;
|
|
13893
|
+
}
|
|
13894
|
+
} else {
|
|
13895
|
+
caseCost = result.costUsd;
|
|
13896
|
+
}
|
|
13897
|
+
if (caseCost !== void 0) {
|
|
13898
|
+
cumulativeBudgetCost += caseCost;
|
|
13899
|
+
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
13900
|
+
budgetExhausted = true;
|
|
13901
|
+
}
|
|
13902
|
+
}
|
|
13903
|
+
}
|
|
13904
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
13905
|
+
failOnErrorTriggered = true;
|
|
13906
|
+
}
|
|
13907
|
+
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
13908
|
+
result = { ...result, beforeAllOutput };
|
|
13909
|
+
beforeAllOutputAttached = true;
|
|
13910
|
+
}
|
|
13911
|
+
if (onProgress) {
|
|
13912
|
+
await onProgress({
|
|
13913
|
+
workerId,
|
|
13914
|
+
testId: evalCase.id,
|
|
13915
|
+
status: result.error ? "failed" : "completed",
|
|
13916
|
+
startedAt: 0,
|
|
13917
|
+
// Not used for completed status
|
|
13918
|
+
completedAt: Date.now(),
|
|
13919
|
+
error: result.error
|
|
13920
|
+
});
|
|
13921
|
+
}
|
|
13922
|
+
if (onResult) {
|
|
13923
|
+
await onResult(result);
|
|
13924
|
+
}
|
|
13925
|
+
return result;
|
|
13926
|
+
} catch (error) {
|
|
13927
|
+
if (onProgress) {
|
|
13928
|
+
await onProgress({
|
|
13929
|
+
workerId,
|
|
13930
|
+
testId: evalCase.id,
|
|
13931
|
+
status: "failed",
|
|
13932
|
+
completedAt: Date.now(),
|
|
13933
|
+
error: error instanceof Error ? error.message : String(error)
|
|
13934
|
+
});
|
|
13935
|
+
}
|
|
13936
|
+
throw error;
|
|
13937
|
+
} finally {
|
|
13938
|
+
if (testPoolSlot) {
|
|
13939
|
+
availablePoolSlots.push(testPoolSlot);
|
|
13940
|
+
}
|
|
13445
13941
|
}
|
|
13446
|
-
|
|
13447
|
-
|
|
13448
|
-
|
|
13449
|
-
|
|
13450
|
-
|
|
13451
|
-
|
|
13452
|
-
|
|
13453
|
-
|
|
13454
|
-
|
|
13455
|
-
|
|
13942
|
+
})
|
|
13943
|
+
);
|
|
13944
|
+
const settled = await Promise.allSettled(promises);
|
|
13945
|
+
const results = [];
|
|
13946
|
+
for (let i = 0; i < settled.length; i++) {
|
|
13947
|
+
const outcome = settled[i];
|
|
13948
|
+
if (outcome.status === "fulfilled") {
|
|
13949
|
+
results.push(outcome.value);
|
|
13950
|
+
} else {
|
|
13951
|
+
const evalCase = filteredEvalCases[i];
|
|
13952
|
+
const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
|
|
13953
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
13954
|
+
const errorResult = buildErrorResult(
|
|
13955
|
+
evalCase,
|
|
13956
|
+
target.name,
|
|
13957
|
+
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
13958
|
+
outcome.reason,
|
|
13959
|
+
promptInputs,
|
|
13960
|
+
primaryProvider,
|
|
13961
|
+
"agent",
|
|
13962
|
+
"provider_error"
|
|
13963
|
+
);
|
|
13964
|
+
results.push(errorResult);
|
|
13965
|
+
if (onResult) {
|
|
13966
|
+
await onResult(errorResult);
|
|
13456
13967
|
}
|
|
13457
|
-
throw error;
|
|
13458
13968
|
}
|
|
13459
|
-
}
|
|
13460
|
-
|
|
13461
|
-
|
|
13462
|
-
|
|
13463
|
-
|
|
13464
|
-
|
|
13465
|
-
|
|
13466
|
-
|
|
13467
|
-
|
|
13468
|
-
|
|
13469
|
-
|
|
13470
|
-
|
|
13471
|
-
|
|
13472
|
-
|
|
13473
|
-
|
|
13474
|
-
|
|
13475
|
-
|
|
13476
|
-
|
|
13477
|
-
|
|
13478
|
-
|
|
13479
|
-
|
|
13480
|
-
);
|
|
13481
|
-
results.push(errorResult);
|
|
13482
|
-
if (onResult) {
|
|
13483
|
-
await onResult(errorResult);
|
|
13969
|
+
}
|
|
13970
|
+
const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
|
|
13971
|
+
if (afterAllWorkspaces.length > 0 && suiteWorkspace?.after_all) {
|
|
13972
|
+
for (const wsPath of afterAllWorkspaces) {
|
|
13973
|
+
const scriptContext = {
|
|
13974
|
+
workspacePath: wsPath,
|
|
13975
|
+
testId: "__after_all__",
|
|
13976
|
+
evalRunId,
|
|
13977
|
+
evalDir
|
|
13978
|
+
};
|
|
13979
|
+
try {
|
|
13980
|
+
const afterAllOutput = await executeWorkspaceScript(
|
|
13981
|
+
suiteWorkspace.after_all,
|
|
13982
|
+
scriptContext,
|
|
13983
|
+
"warn"
|
|
13984
|
+
);
|
|
13985
|
+
if (afterAllOutput && results.length > 0 && wsPath === afterAllWorkspaces[0]) {
|
|
13986
|
+
results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
|
|
13987
|
+
}
|
|
13988
|
+
} catch {
|
|
13989
|
+
}
|
|
13484
13990
|
}
|
|
13485
13991
|
}
|
|
13486
|
-
|
|
13487
|
-
|
|
13488
|
-
|
|
13489
|
-
|
|
13490
|
-
|
|
13491
|
-
|
|
13492
|
-
|
|
13493
|
-
|
|
13494
|
-
try {
|
|
13495
|
-
const afterAllOutput = await executeWorkspaceScript(
|
|
13496
|
-
suiteWorkspace.after_all,
|
|
13497
|
-
scriptContext,
|
|
13498
|
-
"warn"
|
|
13499
|
-
);
|
|
13500
|
-
if (afterAllOutput && results.length > 0) {
|
|
13501
|
-
results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
|
|
13992
|
+
if (sharedWorkspacePath && !poolSlot && poolSlots.length === 0 && !userWorkspacePath) {
|
|
13993
|
+
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
13994
|
+
if (cleanupWorkspaces) {
|
|
13995
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13996
|
+
});
|
|
13997
|
+
} else if (!hasFailure && !keepWorkspaces) {
|
|
13998
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
13999
|
+
});
|
|
13502
14000
|
}
|
|
13503
|
-
} catch {
|
|
13504
14001
|
}
|
|
13505
|
-
}
|
|
13506
|
-
if (sharedWorkspacePath) {
|
|
13507
|
-
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
13508
14002
|
if (cleanupWorkspaces) {
|
|
13509
|
-
await
|
|
13510
|
-
});
|
|
13511
|
-
} else if (!hasFailure && !keepWorkspaces) {
|
|
13512
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
14003
|
+
await cleanupEvalWorkspaces(evalRunId).catch(() => {
|
|
13513
14004
|
});
|
|
13514
14005
|
}
|
|
14006
|
+
return results;
|
|
14007
|
+
} finally {
|
|
14008
|
+
if (poolManager) {
|
|
14009
|
+
if (poolSlot) {
|
|
14010
|
+
await poolManager.releaseSlot(poolSlot);
|
|
14011
|
+
}
|
|
14012
|
+
for (const slot of poolSlots) {
|
|
14013
|
+
if (slot !== poolSlot) {
|
|
14014
|
+
await poolManager.releaseSlot(slot).catch(() => {
|
|
14015
|
+
});
|
|
14016
|
+
}
|
|
14017
|
+
}
|
|
14018
|
+
}
|
|
13515
14019
|
}
|
|
13516
|
-
if (cleanupWorkspaces) {
|
|
13517
|
-
await cleanupEvalWorkspaces(evalRunId).catch(() => {
|
|
13518
|
-
});
|
|
13519
|
-
}
|
|
13520
|
-
return results;
|
|
13521
14020
|
}
|
|
13522
14021
|
async function runBatchEvaluation(options) {
|
|
13523
14022
|
const {
|
|
@@ -13734,7 +14233,7 @@ async function runEvalCase(options) {
|
|
|
13734
14233
|
);
|
|
13735
14234
|
}
|
|
13736
14235
|
if (caseWorkspaceFile && workspacePath) {
|
|
13737
|
-
const copiedFile =
|
|
14236
|
+
const copiedFile = path39.join(workspacePath, path39.basename(caseWorkspaceFile));
|
|
13738
14237
|
try {
|
|
13739
14238
|
await stat7(copiedFile);
|
|
13740
14239
|
caseWorkspaceFile = copiedFile;
|
|
@@ -13744,7 +14243,7 @@ async function runEvalCase(options) {
|
|
|
13744
14243
|
}
|
|
13745
14244
|
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
13746
14245
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
13747
|
-
await
|
|
14246
|
+
await mkdir13(workspacePath, { recursive: true });
|
|
13748
14247
|
}
|
|
13749
14248
|
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
13750
14249
|
const perCaseRepoManager = new RepoManager(void 0, setupDebug);
|
|
@@ -14344,7 +14843,7 @@ async function runEvaluatorList(options) {
|
|
|
14344
14843
|
fileChanges,
|
|
14345
14844
|
workspacePath
|
|
14346
14845
|
};
|
|
14347
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
14846
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path39.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
14348
14847
|
const dispatchContext = {
|
|
14349
14848
|
judgeProvider,
|
|
14350
14849
|
targetResolver,
|
|
@@ -14578,7 +15077,7 @@ function extractProviderError(response) {
|
|
|
14578
15077
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
14579
15078
|
}
|
|
14580
15079
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
14581
|
-
const hash =
|
|
15080
|
+
const hash = createHash3("sha256");
|
|
14582
15081
|
hash.update(provider.id);
|
|
14583
15082
|
hash.update(target.name);
|
|
14584
15083
|
hash.update(evalCase.id);
|
|
@@ -14646,8 +15145,8 @@ function computeWeightedMean(entries) {
|
|
|
14646
15145
|
}
|
|
14647
15146
|
|
|
14648
15147
|
// src/evaluation/evaluate.ts
|
|
14649
|
-
import { existsSync as
|
|
14650
|
-
import
|
|
15148
|
+
import { existsSync as existsSync4 } from "node:fs";
|
|
15149
|
+
import path40 from "node:path";
|
|
14651
15150
|
async function evaluate(config) {
|
|
14652
15151
|
const startTime = Date.now();
|
|
14653
15152
|
if (config.tests && config.specFile) {
|
|
@@ -14669,13 +15168,13 @@ async function evaluate(config) {
|
|
|
14669
15168
|
let evalCases;
|
|
14670
15169
|
let testFilePath;
|
|
14671
15170
|
if (config.specFile) {
|
|
14672
|
-
testFilePath =
|
|
15171
|
+
testFilePath = path40.resolve(config.specFile);
|
|
14673
15172
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
14674
15173
|
verbose: config.verbose,
|
|
14675
15174
|
filter: config.filter
|
|
14676
15175
|
});
|
|
14677
15176
|
} else {
|
|
14678
|
-
testFilePath =
|
|
15177
|
+
testFilePath = path40.join(process.cwd(), "__programmatic__.yaml");
|
|
14679
15178
|
evalCases = (config.tests ?? []).map((test) => {
|
|
14680
15179
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
14681
15180
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -14761,11 +15260,11 @@ function computeSummary(results, durationMs) {
|
|
|
14761
15260
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
14762
15261
|
async function discoverDefaultTarget(repoRoot) {
|
|
14763
15262
|
const cwd = process.cwd();
|
|
14764
|
-
const chain = buildDirectoryChain(
|
|
15263
|
+
const chain = buildDirectoryChain(path40.join(cwd, "_placeholder"), repoRoot);
|
|
14765
15264
|
for (const dir of chain) {
|
|
14766
15265
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
14767
|
-
const targetsPath =
|
|
14768
|
-
if (!
|
|
15266
|
+
const targetsPath = path40.join(dir, candidate);
|
|
15267
|
+
if (!existsSync4(targetsPath)) continue;
|
|
14769
15268
|
try {
|
|
14770
15269
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
14771
15270
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -14779,11 +15278,11 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
14779
15278
|
async function loadEnvHierarchy(repoRoot) {
|
|
14780
15279
|
const { readFileSync: readFileSync2 } = await import("node:fs");
|
|
14781
15280
|
const cwd = process.cwd();
|
|
14782
|
-
const chain = buildDirectoryChain(
|
|
15281
|
+
const chain = buildDirectoryChain(path40.join(cwd, "_placeholder"), repoRoot);
|
|
14783
15282
|
const envFiles = [];
|
|
14784
15283
|
for (const dir of chain) {
|
|
14785
|
-
const envPath =
|
|
14786
|
-
if (
|
|
15284
|
+
const envPath = path40.join(dir, ".env");
|
|
15285
|
+
if (existsSync4(envPath)) envFiles.push(envPath);
|
|
14787
15286
|
}
|
|
14788
15287
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
14789
15288
|
try {
|
|
@@ -14861,12 +15360,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
14861
15360
|
".agentv/config.js"
|
|
14862
15361
|
];
|
|
14863
15362
|
async function loadTsConfig(projectRoot) {
|
|
14864
|
-
const { existsSync:
|
|
15363
|
+
const { existsSync: existsSync5 } = await import("node:fs");
|
|
14865
15364
|
const { pathToFileURL } = await import("node:url");
|
|
14866
15365
|
const { join: join2 } = await import("node:path");
|
|
14867
15366
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
14868
15367
|
const filePath = join2(projectRoot, fileName);
|
|
14869
|
-
if (!
|
|
15368
|
+
if (!existsSync5(filePath)) {
|
|
14870
15369
|
continue;
|
|
14871
15370
|
}
|
|
14872
15371
|
try {
|
|
@@ -14963,8 +15462,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
14963
15462
|
}
|
|
14964
15463
|
|
|
14965
15464
|
// src/evaluation/cache/response-cache.ts
|
|
14966
|
-
import { mkdir as
|
|
14967
|
-
import
|
|
15465
|
+
import { mkdir as mkdir14, readFile as readFile12, writeFile as writeFile9 } from "node:fs/promises";
|
|
15466
|
+
import path41 from "node:path";
|
|
14968
15467
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
14969
15468
|
var ResponseCache = class {
|
|
14970
15469
|
cachePath;
|
|
@@ -14974,7 +15473,7 @@ var ResponseCache = class {
|
|
|
14974
15473
|
async get(key) {
|
|
14975
15474
|
const filePath = this.keyToPath(key);
|
|
14976
15475
|
try {
|
|
14977
|
-
const data = await
|
|
15476
|
+
const data = await readFile12(filePath, "utf8");
|
|
14978
15477
|
return JSON.parse(data);
|
|
14979
15478
|
} catch {
|
|
14980
15479
|
return void 0;
|
|
@@ -14982,13 +15481,13 @@ var ResponseCache = class {
|
|
|
14982
15481
|
}
|
|
14983
15482
|
async set(key, value) {
|
|
14984
15483
|
const filePath = this.keyToPath(key);
|
|
14985
|
-
const dir =
|
|
14986
|
-
await
|
|
14987
|
-
await
|
|
15484
|
+
const dir = path41.dirname(filePath);
|
|
15485
|
+
await mkdir14(dir, { recursive: true });
|
|
15486
|
+
await writeFile9(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
14988
15487
|
}
|
|
14989
15488
|
keyToPath(key) {
|
|
14990
15489
|
const prefix = key.slice(0, 2);
|
|
14991
|
-
return
|
|
15490
|
+
return path41.join(this.cachePath, prefix, `${key}.json`);
|
|
14992
15491
|
}
|
|
14993
15492
|
};
|
|
14994
15493
|
function shouldEnableCache(params) {
|
|
@@ -15470,6 +15969,7 @@ export {
|
|
|
15470
15969
|
TokenUsageEvaluator,
|
|
15471
15970
|
ToolTrajectoryEvaluator,
|
|
15472
15971
|
WorkspaceCreationError,
|
|
15972
|
+
WorkspacePoolManager,
|
|
15473
15973
|
assembleLlmJudgePrompt,
|
|
15474
15974
|
avgToolDurationMs,
|
|
15475
15975
|
buildDirectoryChain,
|
|
@@ -15484,6 +15984,7 @@ export {
|
|
|
15484
15984
|
cleanupEvalWorkspaces,
|
|
15485
15985
|
cleanupWorkspace,
|
|
15486
15986
|
computeTraceSummary,
|
|
15987
|
+
computeWorkspaceFingerprint,
|
|
15487
15988
|
consumeClaudeLogEntries,
|
|
15488
15989
|
consumeCodexLogEntries,
|
|
15489
15990
|
consumeCopilotCliLogEntries,
|
|
@@ -15521,6 +16022,7 @@ export {
|
|
|
15521
16022
|
getSubagentsRoot,
|
|
15522
16023
|
getTraceStateRoot,
|
|
15523
16024
|
getWorkspacePath,
|
|
16025
|
+
getWorkspacePoolRoot,
|
|
15524
16026
|
getWorkspacesRoot,
|
|
15525
16027
|
initializeBaseline,
|
|
15526
16028
|
isEvaluatorKind,
|