@agentv/core 2.14.3 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1009 -504
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +85 -1
- package/dist/index.d.ts +85 -1
- package/dist/index.js +950 -448
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -1244,12 +1244,12 @@ function serializeAttributeValue(value) {
|
|
|
1244
1244
|
if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
|
|
1245
1245
|
return { stringValue: String(value) };
|
|
1246
1246
|
}
|
|
1247
|
-
var
|
|
1247
|
+
var import_promises32, import_node_path45, OtlpJsonFileExporter;
|
|
1248
1248
|
var init_otlp_json_file_exporter = __esm({
|
|
1249
1249
|
"src/observability/otlp-json-file-exporter.ts"() {
|
|
1250
1250
|
"use strict";
|
|
1251
|
-
|
|
1252
|
-
|
|
1251
|
+
import_promises32 = require("fs/promises");
|
|
1252
|
+
import_node_path45 = require("path");
|
|
1253
1253
|
OtlpJsonFileExporter = class {
|
|
1254
1254
|
// biome-ignore lint/suspicious/noExplicitAny: serialized span data
|
|
1255
1255
|
spans = [];
|
|
@@ -1288,7 +1288,7 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1288
1288
|
}
|
|
1289
1289
|
async flush() {
|
|
1290
1290
|
if (this.spans.length === 0) return;
|
|
1291
|
-
await (0,
|
|
1291
|
+
await (0, import_promises32.mkdir)((0, import_node_path45.dirname)(this.filePath), { recursive: true });
|
|
1292
1292
|
const otlpJson = {
|
|
1293
1293
|
resourceSpans: [
|
|
1294
1294
|
{
|
|
@@ -1302,8 +1302,8 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1302
1302
|
}
|
|
1303
1303
|
]
|
|
1304
1304
|
};
|
|
1305
|
-
const { writeFile:
|
|
1306
|
-
await
|
|
1305
|
+
const { writeFile: writeFile10 } = await import("fs/promises");
|
|
1306
|
+
await writeFile10(this.filePath, JSON.stringify(otlpJson, null, 2));
|
|
1307
1307
|
}
|
|
1308
1308
|
};
|
|
1309
1309
|
}
|
|
@@ -1319,13 +1319,13 @@ function hrTimeDiffMs(start, end) {
|
|
|
1319
1319
|
const diffNano = end[1] - start[1];
|
|
1320
1320
|
return Math.round(diffSec * 1e3 + diffNano / 1e6);
|
|
1321
1321
|
}
|
|
1322
|
-
var
|
|
1322
|
+
var import_node_fs14, import_promises33, import_node_path46, SimpleTraceFileExporter;
|
|
1323
1323
|
var init_simple_trace_file_exporter = __esm({
|
|
1324
1324
|
"src/observability/simple-trace-file-exporter.ts"() {
|
|
1325
1325
|
"use strict";
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1326
|
+
import_node_fs14 = require("fs");
|
|
1327
|
+
import_promises33 = require("fs/promises");
|
|
1328
|
+
import_node_path46 = require("path");
|
|
1329
1329
|
SimpleTraceFileExporter = class {
|
|
1330
1330
|
stream = null;
|
|
1331
1331
|
filePath;
|
|
@@ -1338,8 +1338,8 @@ var init_simple_trace_file_exporter = __esm({
|
|
|
1338
1338
|
async ensureStream() {
|
|
1339
1339
|
if (!this.streamReady) {
|
|
1340
1340
|
this.streamReady = (async () => {
|
|
1341
|
-
await (0,
|
|
1342
|
-
this.stream = (0,
|
|
1341
|
+
await (0, import_promises33.mkdir)((0, import_node_path46.dirname)(this.filePath), { recursive: true });
|
|
1342
|
+
this.stream = (0, import_node_fs14.createWriteStream)(this.filePath, { flags: "w" });
|
|
1343
1343
|
return this.stream;
|
|
1344
1344
|
})();
|
|
1345
1345
|
}
|
|
@@ -1457,6 +1457,7 @@ __export(index_exports, {
|
|
|
1457
1457
|
TokenUsageEvaluator: () => TokenUsageEvaluator,
|
|
1458
1458
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
1459
1459
|
WorkspaceCreationError: () => WorkspaceCreationError,
|
|
1460
|
+
WorkspacePoolManager: () => WorkspacePoolManager,
|
|
1460
1461
|
assembleLlmJudgePrompt: () => assembleLlmJudgePrompt,
|
|
1461
1462
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
1462
1463
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
@@ -1471,6 +1472,7 @@ __export(index_exports, {
|
|
|
1471
1472
|
cleanupEvalWorkspaces: () => cleanupEvalWorkspaces,
|
|
1472
1473
|
cleanupWorkspace: () => cleanupWorkspace,
|
|
1473
1474
|
computeTraceSummary: () => computeTraceSummary,
|
|
1475
|
+
computeWorkspaceFingerprint: () => computeWorkspaceFingerprint,
|
|
1474
1476
|
consumeClaudeLogEntries: () => consumeClaudeLogEntries,
|
|
1475
1477
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
1476
1478
|
consumeCopilotCliLogEntries: () => consumeCopilotCliLogEntries,
|
|
@@ -1508,6 +1510,7 @@ __export(index_exports, {
|
|
|
1508
1510
|
getSubagentsRoot: () => getSubagentsRoot,
|
|
1509
1511
|
getTraceStateRoot: () => getTraceStateRoot,
|
|
1510
1512
|
getWorkspacePath: () => getWorkspacePath,
|
|
1513
|
+
getWorkspacePoolRoot: () => getWorkspacePoolRoot,
|
|
1511
1514
|
getWorkspacesRoot: () => getWorkspacesRoot,
|
|
1512
1515
|
initializeBaseline: () => initializeBaseline,
|
|
1513
1516
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
@@ -2236,6 +2239,17 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
2236
2239
|
} else if (otelFile !== void 0) {
|
|
2237
2240
|
logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
|
|
2238
2241
|
}
|
|
2242
|
+
if (typeof obj.pool_workspaces === "boolean") {
|
|
2243
|
+
result.pool_workspaces = obj.pool_workspaces;
|
|
2244
|
+
} else if (obj.pool_workspaces !== void 0) {
|
|
2245
|
+
logWarning(`Invalid execution.pool_workspaces in ${configPath}, expected boolean`);
|
|
2246
|
+
}
|
|
2247
|
+
const poolSlots = obj.pool_slots;
|
|
2248
|
+
if (typeof poolSlots === "number" && Number.isInteger(poolSlots) && poolSlots >= 1 && poolSlots <= 50) {
|
|
2249
|
+
result.pool_slots = poolSlots;
|
|
2250
|
+
} else if (poolSlots !== void 0) {
|
|
2251
|
+
logWarning(`Invalid execution.pool_slots in ${configPath}, expected integer 1-50`);
|
|
2252
|
+
}
|
|
2239
2253
|
return Object.keys(result).length > 0 ? result : void 0;
|
|
2240
2254
|
}
|
|
2241
2255
|
function logWarning(message) {
|
|
@@ -3677,6 +3691,7 @@ async function processMessages(options) {
|
|
|
3677
3691
|
repoRootPath,
|
|
3678
3692
|
guidelinePatterns,
|
|
3679
3693
|
guidelinePaths,
|
|
3694
|
+
treatFileSegmentsAsGuidelines,
|
|
3680
3695
|
textParts,
|
|
3681
3696
|
messageType,
|
|
3682
3697
|
verbose
|
|
@@ -3724,16 +3739,20 @@ async function processMessages(options) {
|
|
|
3724
3739
|
}
|
|
3725
3740
|
try {
|
|
3726
3741
|
const fileContent = (await (0, import_promises5.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
3727
|
-
|
|
3728
|
-
|
|
3729
|
-
|
|
3730
|
-
|
|
3731
|
-
|
|
3732
|
-
|
|
3733
|
-
|
|
3734
|
-
|
|
3735
|
-
|
|
3742
|
+
const classifyAsGuideline = shouldTreatAsGuideline({
|
|
3743
|
+
messageType,
|
|
3744
|
+
resolvedPath,
|
|
3745
|
+
repoRootPath,
|
|
3746
|
+
guidelinePatterns,
|
|
3747
|
+
treatFileSegmentsAsGuidelines
|
|
3748
|
+
});
|
|
3749
|
+
if (classifyAsGuideline && guidelinePaths) {
|
|
3750
|
+
guidelinePaths.push(import_node_path5.default.resolve(resolvedPath));
|
|
3751
|
+
if (verbose) {
|
|
3752
|
+
console.log(` [Guideline] Found: ${displayPath}`);
|
|
3753
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
3736
3754
|
}
|
|
3755
|
+
continue;
|
|
3737
3756
|
}
|
|
3738
3757
|
segments.push({
|
|
3739
3758
|
type: "file",
|
|
@@ -3762,6 +3781,26 @@ async function processMessages(options) {
|
|
|
3762
3781
|
}
|
|
3763
3782
|
return segments;
|
|
3764
3783
|
}
|
|
3784
|
+
function shouldTreatAsGuideline(options) {
|
|
3785
|
+
const {
|
|
3786
|
+
messageType,
|
|
3787
|
+
resolvedPath,
|
|
3788
|
+
repoRootPath,
|
|
3789
|
+
guidelinePatterns,
|
|
3790
|
+
treatFileSegmentsAsGuidelines
|
|
3791
|
+
} = options;
|
|
3792
|
+
if (messageType !== "input") {
|
|
3793
|
+
return false;
|
|
3794
|
+
}
|
|
3795
|
+
if (treatFileSegmentsAsGuidelines) {
|
|
3796
|
+
return true;
|
|
3797
|
+
}
|
|
3798
|
+
if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
3799
|
+
return false;
|
|
3800
|
+
}
|
|
3801
|
+
const relativeToRepo = import_node_path5.default.relative(repoRootPath, resolvedPath);
|
|
3802
|
+
return isGuidelineFile(relativeToRepo, guidelinePatterns);
|
|
3803
|
+
}
|
|
3765
3804
|
function asString3(value) {
|
|
3766
3805
|
return typeof value === "string" ? value : void 0;
|
|
3767
3806
|
}
|
|
@@ -4100,6 +4139,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4100
4139
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
4101
4140
|
console.log(` - ${guidelinePath}`);
|
|
4102
4141
|
}
|
|
4142
|
+
} else if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
4143
|
+
console.log(" No guidelines found (guideline_patterns not configured)");
|
|
4103
4144
|
} else {
|
|
4104
4145
|
console.log(" No guidelines found");
|
|
4105
4146
|
}
|
|
@@ -4469,7 +4510,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4469
4510
|
} else {
|
|
4470
4511
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
4471
4512
|
}
|
|
4472
|
-
const suiteWorkspace =
|
|
4513
|
+
const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
|
|
4473
4514
|
const suiteInputMessages = expandInputShorthand(suite.input);
|
|
4474
4515
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
4475
4516
|
const _globalTarget = asString6(rawGlobalExecution?.target) ?? asString6(suite.target);
|
|
@@ -4505,12 +4546,24 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4505
4546
|
}
|
|
4506
4547
|
const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
|
|
4507
4548
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
4508
|
-
const
|
|
4549
|
+
const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
|
|
4550
|
+
const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
|
|
4509
4551
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
4510
4552
|
const guidelinePaths = [];
|
|
4511
4553
|
const inputTextParts = [];
|
|
4512
|
-
const
|
|
4513
|
-
messages:
|
|
4554
|
+
const suiteInputSegments = effectiveSuiteInputMessages ? await processMessages({
|
|
4555
|
+
messages: effectiveSuiteInputMessages,
|
|
4556
|
+
searchRoots,
|
|
4557
|
+
repoRootPath,
|
|
4558
|
+
guidelinePatterns,
|
|
4559
|
+
guidelinePaths,
|
|
4560
|
+
treatFileSegmentsAsGuidelines: true,
|
|
4561
|
+
textParts: inputTextParts,
|
|
4562
|
+
messageType: "input",
|
|
4563
|
+
verbose
|
|
4564
|
+
}) : [];
|
|
4565
|
+
const testInputSegments = await processMessages({
|
|
4566
|
+
messages: testInputMessages,
|
|
4514
4567
|
searchRoots,
|
|
4515
4568
|
repoRootPath,
|
|
4516
4569
|
guidelinePatterns,
|
|
@@ -4519,6 +4572,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4519
4572
|
messageType: "input",
|
|
4520
4573
|
verbose
|
|
4521
4574
|
});
|
|
4575
|
+
const inputSegments = [...suiteInputSegments, ...testInputSegments];
|
|
4522
4576
|
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
4523
4577
|
messages: expectedMessages,
|
|
4524
4578
|
searchRoots,
|
|
@@ -4566,7 +4620,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4566
4620
|
...guidelinePaths.map((guidelinePath) => import_node_path8.default.resolve(guidelinePath)),
|
|
4567
4621
|
...userFilePaths
|
|
4568
4622
|
];
|
|
4569
|
-
const caseWorkspace =
|
|
4623
|
+
const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
|
|
4570
4624
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
4571
4625
|
const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
|
|
4572
4626
|
const caseTargets = extractTargetsFromTestCase(evalcase);
|
|
@@ -4597,6 +4651,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4597
4651
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
4598
4652
|
console.log(` - ${guidelinePath}`);
|
|
4599
4653
|
}
|
|
4654
|
+
} else if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
4655
|
+
console.log(" No guidelines found (guideline_patterns not configured)");
|
|
4600
4656
|
} else {
|
|
4601
4657
|
console.log(" No guidelines found");
|
|
4602
4658
|
}
|
|
@@ -4696,6 +4752,26 @@ function parseResetConfig(raw) {
|
|
|
4696
4752
|
...afterEach !== void 0 && { after_each: afterEach }
|
|
4697
4753
|
};
|
|
4698
4754
|
}
|
|
4755
|
+
async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
4756
|
+
if (typeof raw === "string") {
|
|
4757
|
+
const workspaceFilePath = import_node_path8.default.resolve(evalFileDir, raw);
|
|
4758
|
+
let content;
|
|
4759
|
+
try {
|
|
4760
|
+
content = await (0, import_promises8.readFile)(workspaceFilePath, "utf8");
|
|
4761
|
+
} catch {
|
|
4762
|
+
throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
|
|
4763
|
+
}
|
|
4764
|
+
const parsed = (0, import_yaml4.parse)(content);
|
|
4765
|
+
if (!isJsonObject(parsed)) {
|
|
4766
|
+
throw new Error(
|
|
4767
|
+
`Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
|
|
4768
|
+
);
|
|
4769
|
+
}
|
|
4770
|
+
const workspaceFileDir = import_node_path8.default.dirname(workspaceFilePath);
|
|
4771
|
+
return parseWorkspaceConfig(parsed, workspaceFileDir);
|
|
4772
|
+
}
|
|
4773
|
+
return parseWorkspaceConfig(raw, evalFileDir);
|
|
4774
|
+
}
|
|
4699
4775
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
4700
4776
|
if (!isJsonObject(raw)) return void 0;
|
|
4701
4777
|
const obj = raw;
|
|
@@ -9493,8 +9569,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
9493
9569
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
9494
9570
|
if (!parseResult.success) {
|
|
9495
9571
|
const firstError = parseResult.error.errors[0];
|
|
9496
|
-
const
|
|
9497
|
-
const prefix =
|
|
9572
|
+
const path44 = firstError?.path.join(".") || "";
|
|
9573
|
+
const prefix = path44 ? `${target.name} ${path44}: ` : `${target.name}: `;
|
|
9498
9574
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
9499
9575
|
}
|
|
9500
9576
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -10010,6 +10086,9 @@ function getSubagentsRoot() {
|
|
|
10010
10086
|
function getTraceStateRoot() {
|
|
10011
10087
|
return import_node_path23.default.join(getAgentvHome(), "trace-state");
|
|
10012
10088
|
}
|
|
10089
|
+
function getWorkspacePoolRoot() {
|
|
10090
|
+
return import_node_path23.default.join(getAgentvHome(), "workspace-pool");
|
|
10091
|
+
}
|
|
10013
10092
|
|
|
10014
10093
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
10015
10094
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
@@ -10832,8 +10911,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
10832
10911
|
|
|
10833
10912
|
**IMPORTANT**: Follow these exact steps:
|
|
10834
10913
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
10835
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
10836
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
10837
10914
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
10838
10915
|
\`\`\`
|
|
10839
10916
|
Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
|
|
@@ -10850,8 +10927,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
10850
10927
|
|
|
10851
10928
|
**IMPORTANT**: Follow these exact steps:
|
|
10852
10929
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
10853
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
10854
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
10855
10930
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
10856
10931
|
3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
|
|
10857
10932
|
`;
|
|
@@ -11464,16 +11539,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
11464
11539
|
});
|
|
11465
11540
|
}
|
|
11466
11541
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
11467
|
-
const { mkdir:
|
|
11542
|
+
const { mkdir: mkdir17, readFile: readFile14, rm: rm7, writeFile: writeFile10 } = await import("fs/promises");
|
|
11468
11543
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
11469
|
-
const
|
|
11544
|
+
const path44 = await import("path");
|
|
11470
11545
|
const { randomUUID: randomUUID8 } = await import("crypto");
|
|
11471
|
-
const dir =
|
|
11472
|
-
await
|
|
11473
|
-
const stdinPath =
|
|
11474
|
-
const stdoutPath =
|
|
11475
|
-
const stderrPath =
|
|
11476
|
-
await
|
|
11546
|
+
const dir = path44.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
11547
|
+
await mkdir17(dir, { recursive: true });
|
|
11548
|
+
const stdinPath = path44.join(dir, "stdin.txt");
|
|
11549
|
+
const stdoutPath = path44.join(dir, "stdout.txt");
|
|
11550
|
+
const stderrPath = path44.join(dir, "stderr.txt");
|
|
11551
|
+
await writeFile10(stdinPath, stdinPayload, "utf8");
|
|
11477
11552
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
11478
11553
|
const { spawn: spawn4 } = await import("child_process");
|
|
11479
11554
|
try {
|
|
@@ -11502,11 +11577,11 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
11502
11577
|
resolve(code ?? 0);
|
|
11503
11578
|
});
|
|
11504
11579
|
});
|
|
11505
|
-
const stdout = (await
|
|
11506
|
-
const stderr = (await
|
|
11580
|
+
const stdout = (await readFile14(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
11581
|
+
const stderr = (await readFile14(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
11507
11582
|
return { stdout, stderr, exitCode };
|
|
11508
11583
|
} finally {
|
|
11509
|
-
await
|
|
11584
|
+
await rm7(dir, { recursive: true, force: true });
|
|
11510
11585
|
}
|
|
11511
11586
|
}
|
|
11512
11587
|
|
|
@@ -11824,7 +11899,7 @@ var CodeEvaluator = class {
|
|
|
11824
11899
|
outputPath,
|
|
11825
11900
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
11826
11901
|
inputFiles: context2.evalCase.file_paths.filter(
|
|
11827
|
-
(
|
|
11902
|
+
(path44) => !context2.evalCase.guideline_paths.includes(path44)
|
|
11828
11903
|
),
|
|
11829
11904
|
input: context2.evalCase.input,
|
|
11830
11905
|
trace: context2.trace ?? null,
|
|
@@ -12103,6 +12178,8 @@ ${context2.fileChanges}`;
|
|
|
12103
12178
|
};
|
|
12104
12179
|
} catch (e) {
|
|
12105
12180
|
const message = e instanceof Error ? e.message : String(e);
|
|
12181
|
+
const evalName = context2.evaluator?.name ?? "llm-judge";
|
|
12182
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
12106
12183
|
return {
|
|
12107
12184
|
score: 0,
|
|
12108
12185
|
verdict: "skip",
|
|
@@ -12131,24 +12208,39 @@ ${context2.fileChanges}`;
|
|
|
12131
12208
|
systemPrompt,
|
|
12132
12209
|
target: judgeProvider.targetName
|
|
12133
12210
|
};
|
|
12134
|
-
|
|
12135
|
-
|
|
12136
|
-
|
|
12137
|
-
|
|
12138
|
-
|
|
12139
|
-
|
|
12140
|
-
|
|
12141
|
-
|
|
12142
|
-
|
|
12143
|
-
|
|
12144
|
-
|
|
12145
|
-
|
|
12146
|
-
|
|
12147
|
-
|
|
12148
|
-
|
|
12149
|
-
|
|
12150
|
-
|
|
12151
|
-
|
|
12211
|
+
try {
|
|
12212
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
12213
|
+
context: context2,
|
|
12214
|
+
judgeProvider,
|
|
12215
|
+
systemPrompt,
|
|
12216
|
+
userPrompt: prompt,
|
|
12217
|
+
schema: rubricEvaluationSchema
|
|
12218
|
+
});
|
|
12219
|
+
const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
|
|
12220
|
+
return {
|
|
12221
|
+
score,
|
|
12222
|
+
verdict,
|
|
12223
|
+
hits,
|
|
12224
|
+
misses,
|
|
12225
|
+
expectedAspectCount: rubrics.length,
|
|
12226
|
+
reasoning: data.overall_reasoning,
|
|
12227
|
+
evaluatorRawRequest,
|
|
12228
|
+
tokenUsage
|
|
12229
|
+
};
|
|
12230
|
+
} catch (e) {
|
|
12231
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
12232
|
+
const evalName = context2.evaluator?.name ?? "llm-judge";
|
|
12233
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
12234
|
+
return {
|
|
12235
|
+
score: 0,
|
|
12236
|
+
verdict: "skip",
|
|
12237
|
+
hits: [],
|
|
12238
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
12239
|
+
expectedAspectCount: rubrics.length,
|
|
12240
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
12241
|
+
evaluatorRawRequest
|
|
12242
|
+
};
|
|
12243
|
+
}
|
|
12152
12244
|
}
|
|
12153
12245
|
/**
|
|
12154
12246
|
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
@@ -12162,25 +12254,40 @@ ${context2.fileChanges}`;
|
|
|
12162
12254
|
systemPrompt,
|
|
12163
12255
|
target: judgeProvider.targetName
|
|
12164
12256
|
};
|
|
12165
|
-
|
|
12166
|
-
|
|
12167
|
-
|
|
12168
|
-
|
|
12169
|
-
|
|
12170
|
-
|
|
12171
|
-
|
|
12172
|
-
|
|
12173
|
-
|
|
12174
|
-
|
|
12175
|
-
|
|
12176
|
-
|
|
12177
|
-
|
|
12178
|
-
|
|
12179
|
-
|
|
12180
|
-
|
|
12181
|
-
|
|
12182
|
-
|
|
12183
|
-
|
|
12257
|
+
try {
|
|
12258
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
12259
|
+
context: context2,
|
|
12260
|
+
judgeProvider,
|
|
12261
|
+
systemPrompt,
|
|
12262
|
+
userPrompt: prompt,
|
|
12263
|
+
schema: scoreRangeEvaluationSchema
|
|
12264
|
+
});
|
|
12265
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
12266
|
+
return {
|
|
12267
|
+
score,
|
|
12268
|
+
verdict,
|
|
12269
|
+
hits,
|
|
12270
|
+
misses,
|
|
12271
|
+
expectedAspectCount: rubrics.length,
|
|
12272
|
+
reasoning: data.overall_reasoning,
|
|
12273
|
+
evaluatorRawRequest,
|
|
12274
|
+
details,
|
|
12275
|
+
tokenUsage
|
|
12276
|
+
};
|
|
12277
|
+
} catch (e) {
|
|
12278
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
12279
|
+
const evalName = context2.evaluator?.name ?? "llm-judge";
|
|
12280
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
12281
|
+
return {
|
|
12282
|
+
score: 0,
|
|
12283
|
+
verdict: "skip",
|
|
12284
|
+
hits: [],
|
|
12285
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
12286
|
+
expectedAspectCount: rubrics.length,
|
|
12287
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
12288
|
+
evaluatorRawRequest
|
|
12289
|
+
};
|
|
12290
|
+
}
|
|
12184
12291
|
}
|
|
12185
12292
|
/**
|
|
12186
12293
|
* Build prompt for score-range rubric evaluation.
|
|
@@ -12466,19 +12573,13 @@ var CompositeEvaluator = class {
|
|
|
12466
12573
|
runWeightedAverage(results, weights) {
|
|
12467
12574
|
let totalWeight = 0;
|
|
12468
12575
|
let weightedSum = 0;
|
|
12576
|
+
let evaluatedCount = 0;
|
|
12469
12577
|
const allHits = [];
|
|
12470
12578
|
const allMisses = [];
|
|
12471
12579
|
const reasoningParts = [];
|
|
12472
12580
|
const scores = [];
|
|
12473
12581
|
for (const member of results) {
|
|
12474
12582
|
const weight = weights?.[member.id] ?? 1;
|
|
12475
|
-
totalWeight += weight;
|
|
12476
|
-
weightedSum += member.result.score * weight;
|
|
12477
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
12478
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
12479
|
-
if (member.result.reasoning) {
|
|
12480
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
12481
|
-
}
|
|
12482
12583
|
scores.push({
|
|
12483
12584
|
name: member.id,
|
|
12484
12585
|
type: member.type,
|
|
@@ -12493,6 +12594,32 @@ var CompositeEvaluator = class {
|
|
|
12493
12594
|
details: member.result.details,
|
|
12494
12595
|
tokenUsage: member.result.tokenUsage
|
|
12495
12596
|
});
|
|
12597
|
+
if (member.result.verdict === "skip") {
|
|
12598
|
+
continue;
|
|
12599
|
+
}
|
|
12600
|
+
evaluatedCount++;
|
|
12601
|
+
totalWeight += weight;
|
|
12602
|
+
weightedSum += member.result.score * weight;
|
|
12603
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
12604
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
12605
|
+
if (member.result.reasoning) {
|
|
12606
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
12607
|
+
}
|
|
12608
|
+
}
|
|
12609
|
+
if (evaluatedCount === 0 && results.length > 0) {
|
|
12610
|
+
return {
|
|
12611
|
+
score: 0,
|
|
12612
|
+
verdict: "skip",
|
|
12613
|
+
hits: [],
|
|
12614
|
+
misses: [],
|
|
12615
|
+
expectedAspectCount: 1,
|
|
12616
|
+
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
12617
|
+
evaluatorRawRequest: {
|
|
12618
|
+
aggregator: "weighted_average",
|
|
12619
|
+
...weights ? { weights } : {}
|
|
12620
|
+
},
|
|
12621
|
+
scores
|
|
12622
|
+
};
|
|
12496
12623
|
}
|
|
12497
12624
|
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
12498
12625
|
return {
|
|
@@ -12516,19 +12643,8 @@ var CompositeEvaluator = class {
|
|
|
12516
12643
|
const reasoningParts = [];
|
|
12517
12644
|
let passingCount = 0;
|
|
12518
12645
|
let borderlineCount = 0;
|
|
12646
|
+
let evaluatedCount = 0;
|
|
12519
12647
|
for (const member of results) {
|
|
12520
|
-
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
12521
|
-
if (isPassing) {
|
|
12522
|
-
passingCount++;
|
|
12523
|
-
if (member.result.verdict === "borderline") {
|
|
12524
|
-
borderlineCount++;
|
|
12525
|
-
}
|
|
12526
|
-
}
|
|
12527
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
12528
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
12529
|
-
if (member.result.reasoning) {
|
|
12530
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
12531
|
-
}
|
|
12532
12648
|
scores.push({
|
|
12533
12649
|
name: member.id,
|
|
12534
12650
|
type: member.type,
|
|
@@ -12542,8 +12658,39 @@ var CompositeEvaluator = class {
|
|
|
12542
12658
|
details: member.result.details,
|
|
12543
12659
|
tokenUsage: member.result.tokenUsage
|
|
12544
12660
|
});
|
|
12661
|
+
if (member.result.verdict === "skip") {
|
|
12662
|
+
continue;
|
|
12663
|
+
}
|
|
12664
|
+
evaluatedCount++;
|
|
12665
|
+
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
12666
|
+
if (isPassing) {
|
|
12667
|
+
passingCount++;
|
|
12668
|
+
if (member.result.verdict === "borderline") {
|
|
12669
|
+
borderlineCount++;
|
|
12670
|
+
}
|
|
12671
|
+
}
|
|
12672
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
12673
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
12674
|
+
if (member.result.reasoning) {
|
|
12675
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
12676
|
+
}
|
|
12677
|
+
}
|
|
12678
|
+
if (evaluatedCount === 0 && results.length > 0) {
|
|
12679
|
+
return {
|
|
12680
|
+
score: 0,
|
|
12681
|
+
verdict: "skip",
|
|
12682
|
+
hits: [],
|
|
12683
|
+
misses: [],
|
|
12684
|
+
expectedAspectCount: 1,
|
|
12685
|
+
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
12686
|
+
evaluatorRawRequest: {
|
|
12687
|
+
aggregator: "threshold",
|
|
12688
|
+
threshold
|
|
12689
|
+
},
|
|
12690
|
+
scores
|
|
12691
|
+
};
|
|
12545
12692
|
}
|
|
12546
|
-
const totalCount =
|
|
12693
|
+
const totalCount = evaluatedCount;
|
|
12547
12694
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
12548
12695
|
const pass = score >= threshold;
|
|
12549
12696
|
if (pass && borderlineCount > 0) {
|
|
@@ -13051,115 +13198,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
13051
13198
|
* Evaluate a single field against the expected value.
|
|
13052
13199
|
*/
|
|
13053
13200
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
13054
|
-
const { path:
|
|
13055
|
-
const candidateValue = resolvePath(candidateData,
|
|
13056
|
-
const expectedValue = resolvePath(expectedData,
|
|
13201
|
+
const { path: path44, match, required = true, weight = 1 } = fieldConfig;
|
|
13202
|
+
const candidateValue = resolvePath(candidateData, path44);
|
|
13203
|
+
const expectedValue = resolvePath(expectedData, path44);
|
|
13057
13204
|
if (expectedValue === void 0) {
|
|
13058
13205
|
return {
|
|
13059
|
-
path:
|
|
13206
|
+
path: path44,
|
|
13060
13207
|
score: 1,
|
|
13061
13208
|
// No expected value means no comparison needed
|
|
13062
13209
|
weight,
|
|
13063
13210
|
hit: true,
|
|
13064
|
-
message: `${
|
|
13211
|
+
message: `${path44}: no expected value`
|
|
13065
13212
|
};
|
|
13066
13213
|
}
|
|
13067
13214
|
if (candidateValue === void 0) {
|
|
13068
13215
|
if (required) {
|
|
13069
13216
|
return {
|
|
13070
|
-
path:
|
|
13217
|
+
path: path44,
|
|
13071
13218
|
score: 0,
|
|
13072
13219
|
weight,
|
|
13073
13220
|
hit: false,
|
|
13074
|
-
message: `${
|
|
13221
|
+
message: `${path44} (required, missing)`
|
|
13075
13222
|
};
|
|
13076
13223
|
}
|
|
13077
13224
|
return {
|
|
13078
|
-
path:
|
|
13225
|
+
path: path44,
|
|
13079
13226
|
score: 1,
|
|
13080
13227
|
// Don't penalize missing optional fields
|
|
13081
13228
|
weight: 0,
|
|
13082
13229
|
// Zero weight means it won't affect the score
|
|
13083
13230
|
hit: true,
|
|
13084
|
-
message: `${
|
|
13231
|
+
message: `${path44}: optional field missing`
|
|
13085
13232
|
};
|
|
13086
13233
|
}
|
|
13087
13234
|
switch (match) {
|
|
13088
13235
|
case "exact":
|
|
13089
|
-
return this.compareExact(
|
|
13236
|
+
return this.compareExact(path44, candidateValue, expectedValue, weight);
|
|
13090
13237
|
case "numeric_tolerance":
|
|
13091
13238
|
return this.compareNumericTolerance(
|
|
13092
|
-
|
|
13239
|
+
path44,
|
|
13093
13240
|
candidateValue,
|
|
13094
13241
|
expectedValue,
|
|
13095
13242
|
fieldConfig,
|
|
13096
13243
|
weight
|
|
13097
13244
|
);
|
|
13098
13245
|
case "date":
|
|
13099
|
-
return this.compareDate(
|
|
13246
|
+
return this.compareDate(path44, candidateValue, expectedValue, fieldConfig, weight);
|
|
13100
13247
|
default:
|
|
13101
13248
|
return {
|
|
13102
|
-
path:
|
|
13249
|
+
path: path44,
|
|
13103
13250
|
score: 0,
|
|
13104
13251
|
weight,
|
|
13105
13252
|
hit: false,
|
|
13106
|
-
message: `${
|
|
13253
|
+
message: `${path44}: unknown match type "${match}"`
|
|
13107
13254
|
};
|
|
13108
13255
|
}
|
|
13109
13256
|
}
|
|
13110
13257
|
/**
|
|
13111
13258
|
* Exact equality comparison.
|
|
13112
13259
|
*/
|
|
13113
|
-
compareExact(
|
|
13260
|
+
compareExact(path44, candidateValue, expectedValue, weight) {
|
|
13114
13261
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
13115
13262
|
return {
|
|
13116
|
-
path:
|
|
13263
|
+
path: path44,
|
|
13117
13264
|
score: 1,
|
|
13118
13265
|
weight,
|
|
13119
13266
|
hit: true,
|
|
13120
|
-
message:
|
|
13267
|
+
message: path44
|
|
13121
13268
|
};
|
|
13122
13269
|
}
|
|
13123
13270
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
13124
13271
|
return {
|
|
13125
|
-
path:
|
|
13272
|
+
path: path44,
|
|
13126
13273
|
score: 0,
|
|
13127
13274
|
weight,
|
|
13128
13275
|
hit: false,
|
|
13129
|
-
message: `${
|
|
13276
|
+
message: `${path44} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
13130
13277
|
};
|
|
13131
13278
|
}
|
|
13132
13279
|
return {
|
|
13133
|
-
path:
|
|
13280
|
+
path: path44,
|
|
13134
13281
|
score: 0,
|
|
13135
13282
|
weight,
|
|
13136
13283
|
hit: false,
|
|
13137
|
-
message: `${
|
|
13284
|
+
message: `${path44} (value mismatch)`
|
|
13138
13285
|
};
|
|
13139
13286
|
}
|
|
13140
13287
|
/**
|
|
13141
13288
|
* Numeric comparison with absolute or relative tolerance.
|
|
13142
13289
|
*/
|
|
13143
|
-
compareNumericTolerance(
|
|
13290
|
+
compareNumericTolerance(path44, candidateValue, expectedValue, fieldConfig, weight) {
|
|
13144
13291
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
13145
13292
|
const candidateNum = toNumber2(candidateValue);
|
|
13146
13293
|
const expectedNum = toNumber2(expectedValue);
|
|
13147
13294
|
if (candidateNum === null || expectedNum === null) {
|
|
13148
13295
|
return {
|
|
13149
|
-
path:
|
|
13296
|
+
path: path44,
|
|
13150
13297
|
score: 0,
|
|
13151
13298
|
weight,
|
|
13152
13299
|
hit: false,
|
|
13153
|
-
message: `${
|
|
13300
|
+
message: `${path44} (non-numeric value)`
|
|
13154
13301
|
};
|
|
13155
13302
|
}
|
|
13156
13303
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
13157
13304
|
return {
|
|
13158
|
-
path:
|
|
13305
|
+
path: path44,
|
|
13159
13306
|
score: 0,
|
|
13160
13307
|
weight,
|
|
13161
13308
|
hit: false,
|
|
13162
|
-
message: `${
|
|
13309
|
+
message: `${path44} (invalid numeric value)`
|
|
13163
13310
|
};
|
|
13164
13311
|
}
|
|
13165
13312
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -13172,61 +13319,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
13172
13319
|
}
|
|
13173
13320
|
if (withinTolerance) {
|
|
13174
13321
|
return {
|
|
13175
|
-
path:
|
|
13322
|
+
path: path44,
|
|
13176
13323
|
score: 1,
|
|
13177
13324
|
weight,
|
|
13178
13325
|
hit: true,
|
|
13179
|
-
message: `${
|
|
13326
|
+
message: `${path44} (within tolerance: diff=${diff.toFixed(2)})`
|
|
13180
13327
|
};
|
|
13181
13328
|
}
|
|
13182
13329
|
return {
|
|
13183
|
-
path:
|
|
13330
|
+
path: path44,
|
|
13184
13331
|
score: 0,
|
|
13185
13332
|
weight,
|
|
13186
13333
|
hit: false,
|
|
13187
|
-
message: `${
|
|
13334
|
+
message: `${path44} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
13188
13335
|
};
|
|
13189
13336
|
}
|
|
13190
13337
|
/**
|
|
13191
13338
|
* Date comparison with format normalization.
|
|
13192
13339
|
*/
|
|
13193
|
-
compareDate(
|
|
13340
|
+
compareDate(path44, candidateValue, expectedValue, fieldConfig, weight) {
|
|
13194
13341
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
13195
13342
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
13196
13343
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
13197
13344
|
if (candidateDate === null) {
|
|
13198
13345
|
return {
|
|
13199
|
-
path:
|
|
13346
|
+
path: path44,
|
|
13200
13347
|
score: 0,
|
|
13201
13348
|
weight,
|
|
13202
13349
|
hit: false,
|
|
13203
|
-
message: `${
|
|
13350
|
+
message: `${path44} (unparseable candidate date)`
|
|
13204
13351
|
};
|
|
13205
13352
|
}
|
|
13206
13353
|
if (expectedDate === null) {
|
|
13207
13354
|
return {
|
|
13208
|
-
path:
|
|
13355
|
+
path: path44,
|
|
13209
13356
|
score: 0,
|
|
13210
13357
|
weight,
|
|
13211
13358
|
hit: false,
|
|
13212
|
-
message: `${
|
|
13359
|
+
message: `${path44} (unparseable expected date)`
|
|
13213
13360
|
};
|
|
13214
13361
|
}
|
|
13215
13362
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
13216
13363
|
return {
|
|
13217
|
-
path:
|
|
13364
|
+
path: path44,
|
|
13218
13365
|
score: 1,
|
|
13219
13366
|
weight,
|
|
13220
13367
|
hit: true,
|
|
13221
|
-
message:
|
|
13368
|
+
message: path44
|
|
13222
13369
|
};
|
|
13223
13370
|
}
|
|
13224
13371
|
return {
|
|
13225
|
-
path:
|
|
13372
|
+
path: path44,
|
|
13226
13373
|
score: 0,
|
|
13227
13374
|
weight,
|
|
13228
13375
|
hit: false,
|
|
13229
|
-
message: `${
|
|
13376
|
+
message: `${path44} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
13230
13377
|
};
|
|
13231
13378
|
}
|
|
13232
13379
|
/**
|
|
@@ -13267,11 +13414,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
13267
13414
|
};
|
|
13268
13415
|
}
|
|
13269
13416
|
};
|
|
13270
|
-
function resolvePath(obj,
|
|
13271
|
-
if (!
|
|
13417
|
+
function resolvePath(obj, path44) {
|
|
13418
|
+
if (!path44 || !obj) {
|
|
13272
13419
|
return void 0;
|
|
13273
13420
|
}
|
|
13274
|
-
const parts =
|
|
13421
|
+
const parts = path44.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
13275
13422
|
let current = obj;
|
|
13276
13423
|
for (const part of parts) {
|
|
13277
13424
|
if (current === null || current === void 0) {
|
|
@@ -14089,8 +14236,8 @@ var TokenUsageEvaluator = class {
|
|
|
14089
14236
|
};
|
|
14090
14237
|
|
|
14091
14238
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
14092
|
-
function getNestedValue(obj,
|
|
14093
|
-
const parts =
|
|
14239
|
+
function getNestedValue(obj, path44) {
|
|
14240
|
+
const parts = path44.split(".");
|
|
14094
14241
|
let current = obj;
|
|
14095
14242
|
for (const part of parts) {
|
|
14096
14243
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -14651,9 +14798,9 @@ function runEqualsAssertion(output, value) {
|
|
|
14651
14798
|
}
|
|
14652
14799
|
|
|
14653
14800
|
// src/evaluation/orchestrator.ts
|
|
14654
|
-
var
|
|
14655
|
-
var
|
|
14656
|
-
var
|
|
14801
|
+
var import_node_crypto10 = require("crypto");
|
|
14802
|
+
var import_promises30 = require("fs/promises");
|
|
14803
|
+
var import_node_path42 = __toESM(require("path"), 1);
|
|
14657
14804
|
var import_micromatch4 = __toESM(require("micromatch"), 1);
|
|
14658
14805
|
|
|
14659
14806
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -15523,7 +15670,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
15523
15670
|
}
|
|
15524
15671
|
}
|
|
15525
15672
|
|
|
15526
|
-
// src/evaluation/workspace/
|
|
15673
|
+
// src/evaluation/workspace/pool-manager.ts
|
|
15527
15674
|
var import_node_child_process7 = require("child_process");
|
|
15528
15675
|
var import_node_crypto8 = require("crypto");
|
|
15529
15676
|
var import_node_fs11 = require("fs");
|
|
@@ -15531,8 +15678,6 @@ var import_promises27 = require("fs/promises");
|
|
|
15531
15678
|
var import_node_path39 = __toESM(require("path"), 1);
|
|
15532
15679
|
var import_node_util5 = require("util");
|
|
15533
15680
|
var execFileAsync = (0, import_node_util5.promisify)(import_node_child_process7.execFile);
|
|
15534
|
-
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
15535
|
-
var LOCK_TIMEOUT_MS = 6e4;
|
|
15536
15681
|
function gitEnv() {
|
|
15537
15682
|
const env = { ...process.env };
|
|
15538
15683
|
for (const key of Object.keys(env)) {
|
|
@@ -15547,75 +15692,339 @@ function gitEnv() {
|
|
|
15547
15692
|
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
15548
15693
|
};
|
|
15549
15694
|
}
|
|
15550
|
-
function cacheKey(source) {
|
|
15551
|
-
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
15552
|
-
return (0, import_node_crypto8.createHash)("sha256").update(raw).digest("hex");
|
|
15553
|
-
}
|
|
15554
|
-
function getSourceUrl(source) {
|
|
15555
|
-
return source.type === "git" ? source.url : source.path;
|
|
15556
|
-
}
|
|
15557
15695
|
async function git(args, opts) {
|
|
15558
15696
|
const { stdout } = await execFileAsync("git", args, {
|
|
15559
15697
|
cwd: opts?.cwd,
|
|
15560
|
-
timeout: opts?.timeout ??
|
|
15698
|
+
timeout: opts?.timeout ?? 3e5,
|
|
15561
15699
|
env: gitEnv(),
|
|
15562
15700
|
maxBuffer: 50 * 1024 * 1024
|
|
15563
|
-
// 50MB
|
|
15564
15701
|
});
|
|
15565
15702
|
return stdout.trim();
|
|
15566
15703
|
}
|
|
15567
|
-
|
|
15568
|
-
const
|
|
15569
|
-
|
|
15570
|
-
|
|
15571
|
-
|
|
15572
|
-
|
|
15573
|
-
|
|
15574
|
-
|
|
15575
|
-
|
|
15704
|
+
function normalizeRepoForFingerprint(repo) {
|
|
15705
|
+
const source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
|
|
15706
|
+
const result = {
|
|
15707
|
+
path: repo.path,
|
|
15708
|
+
source,
|
|
15709
|
+
ref: repo.checkout?.ref ?? "HEAD"
|
|
15710
|
+
};
|
|
15711
|
+
if (repo.clone?.depth !== void 0) {
|
|
15712
|
+
result.depth = repo.clone.depth;
|
|
15713
|
+
}
|
|
15714
|
+
if (repo.clone?.filter !== void 0) {
|
|
15715
|
+
result.filter = repo.clone.filter;
|
|
15716
|
+
}
|
|
15717
|
+
if (repo.clone?.sparse?.length) {
|
|
15718
|
+
result.sparse = [...repo.clone.sparse].sort();
|
|
15719
|
+
}
|
|
15720
|
+
return result;
|
|
15721
|
+
}
|
|
15722
|
+
function computeWorkspaceFingerprint(templatePath, repos) {
|
|
15723
|
+
const canonical = {
|
|
15724
|
+
templatePath: templatePath ?? null,
|
|
15725
|
+
repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint)
|
|
15726
|
+
};
|
|
15727
|
+
return (0, import_node_crypto8.createHash)("sha256").update(JSON.stringify(canonical)).digest("hex");
|
|
15728
|
+
}
|
|
15729
|
+
async function copyDirectoryRecursive2(src, dest, skipDirs) {
|
|
15730
|
+
await (0, import_promises27.mkdir)(dest, { recursive: true });
|
|
15731
|
+
const entries = await (0, import_promises27.readdir)(src, { withFileTypes: true });
|
|
15732
|
+
for (const entry of entries) {
|
|
15733
|
+
const srcPath = import_node_path39.default.join(src, entry.name);
|
|
15734
|
+
const destPath = import_node_path39.default.join(dest, entry.name);
|
|
15735
|
+
if (entry.name === ".git") {
|
|
15736
|
+
continue;
|
|
15737
|
+
}
|
|
15738
|
+
if (entry.isDirectory()) {
|
|
15739
|
+
if (skipDirs?.has(entry.name)) {
|
|
15576
15740
|
continue;
|
|
15577
15741
|
}
|
|
15578
|
-
|
|
15742
|
+
await copyDirectoryRecursive2(srcPath, destPath, skipDirs);
|
|
15743
|
+
} else {
|
|
15744
|
+
await (0, import_promises27.cp)(srcPath, destPath, { preserveTimestamps: true, force: true });
|
|
15579
15745
|
}
|
|
15580
15746
|
}
|
|
15581
|
-
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
15582
|
-
}
|
|
15583
|
-
async function releaseLock(lockPath) {
|
|
15584
|
-
try {
|
|
15585
|
-
await (0, import_promises27.unlink)(lockPath);
|
|
15586
|
-
} catch {
|
|
15587
|
-
}
|
|
15588
15747
|
}
|
|
15589
|
-
var
|
|
15590
|
-
|
|
15591
|
-
|
|
15592
|
-
|
|
15593
|
-
this.cacheDir = cacheDir ?? getGitCacheRoot();
|
|
15594
|
-
this.verbose = verbose;
|
|
15748
|
+
var WorkspacePoolManager = class {
|
|
15749
|
+
poolRoot;
|
|
15750
|
+
constructor(poolRoot) {
|
|
15751
|
+
this.poolRoot = poolRoot ?? getWorkspacePoolRoot();
|
|
15595
15752
|
}
|
|
15596
|
-
|
|
15597
|
-
|
|
15598
|
-
|
|
15599
|
-
|
|
15600
|
-
|
|
15753
|
+
/**
|
|
15754
|
+
* Acquire a workspace slot from the pool.
|
|
15755
|
+
*
|
|
15756
|
+
* 1. Compute fingerprint from template + repos
|
|
15757
|
+
* 2. Check drift (compare stored metadata.json fingerprint vs computed)
|
|
15758
|
+
* 3. If drift: warn, remove all slots, rematerialize
|
|
15759
|
+
* 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
|
|
15760
|
+
* 5. If slot exists: reset repos, re-copy template files (skip repo directories)
|
|
15761
|
+
* 6. If new slot: copy template, materialize all repos, write metadata.json
|
|
15762
|
+
* 7. Return the slot (with path, index, isExisting)
|
|
15763
|
+
*/
|
|
15764
|
+
async acquireWorkspace(options) {
|
|
15765
|
+
const { templatePath, repos, maxSlots, repoManager } = options;
|
|
15766
|
+
const fingerprint = computeWorkspaceFingerprint(templatePath, repos);
|
|
15767
|
+
const poolDir = import_node_path39.default.join(this.poolRoot, fingerprint);
|
|
15768
|
+
await (0, import_promises27.mkdir)(poolDir, { recursive: true });
|
|
15769
|
+
const drifted = await this.checkDrift(poolDir, fingerprint);
|
|
15770
|
+
if (drifted) {
|
|
15771
|
+
console.warn(
|
|
15772
|
+
`[workspace-pool] Drift detected for fingerprint ${fingerprint.slice(0, 12)}... Removing stale slots.`
|
|
15601
15773
|
);
|
|
15774
|
+
await this.removeAllSlots(poolDir);
|
|
15602
15775
|
}
|
|
15603
|
-
|
|
15604
|
-
const
|
|
15605
|
-
|
|
15606
|
-
|
|
15607
|
-
|
|
15608
|
-
|
|
15776
|
+
for (let i = 0; i < maxSlots; i++) {
|
|
15777
|
+
const slotPath = import_node_path39.default.join(poolDir, `slot-${i}`);
|
|
15778
|
+
const lockPath = `${slotPath}.lock`;
|
|
15779
|
+
const locked = await this.tryLock(lockPath);
|
|
15780
|
+
if (!locked) {
|
|
15781
|
+
continue;
|
|
15609
15782
|
}
|
|
15610
|
-
|
|
15611
|
-
|
|
15612
|
-
|
|
15613
|
-
|
|
15614
|
-
|
|
15615
|
-
|
|
15616
|
-
|
|
15783
|
+
const slotExists = (0, import_node_fs11.existsSync)(slotPath);
|
|
15784
|
+
if (slotExists) {
|
|
15785
|
+
await this.resetSlot(slotPath, templatePath, repos);
|
|
15786
|
+
return {
|
|
15787
|
+
index: i,
|
|
15788
|
+
path: slotPath,
|
|
15789
|
+
isExisting: true,
|
|
15790
|
+
lockPath,
|
|
15791
|
+
fingerprint,
|
|
15792
|
+
poolDir
|
|
15793
|
+
};
|
|
15617
15794
|
}
|
|
15618
|
-
|
|
15795
|
+
await (0, import_promises27.mkdir)(slotPath, { recursive: true });
|
|
15796
|
+
if (templatePath) {
|
|
15797
|
+
await copyDirectoryRecursive2(templatePath, slotPath);
|
|
15798
|
+
}
|
|
15799
|
+
if (repos.length > 0) {
|
|
15800
|
+
await repoManager.materializeAll(repos, slotPath);
|
|
15801
|
+
}
|
|
15802
|
+
await this.writeMetadata(poolDir, fingerprint, templatePath ?? null, repos);
|
|
15803
|
+
return {
|
|
15804
|
+
index: i,
|
|
15805
|
+
path: slotPath,
|
|
15806
|
+
isExisting: false,
|
|
15807
|
+
lockPath,
|
|
15808
|
+
fingerprint,
|
|
15809
|
+
poolDir
|
|
15810
|
+
};
|
|
15811
|
+
}
|
|
15812
|
+
throw new Error(
|
|
15813
|
+
`All ${maxSlots} pool slots are locked for fingerprint ${fingerprint.slice(0, 12)}...`
|
|
15814
|
+
);
|
|
15815
|
+
}
|
|
15816
|
+
/** Remove lock file to release a slot. */
|
|
15817
|
+
async releaseSlot(slot) {
|
|
15818
|
+
try {
|
|
15819
|
+
await (0, import_promises27.unlink)(slot.lockPath);
|
|
15820
|
+
} catch {
|
|
15821
|
+
}
|
|
15822
|
+
}
|
|
15823
|
+
/**
|
|
15824
|
+
* Try to acquire a PID-based lock file.
|
|
15825
|
+
* On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
|
|
15826
|
+
* Returns true if lock acquired, false if slot is actively locked.
|
|
15827
|
+
* Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
|
|
15828
|
+
*/
|
|
15829
|
+
async tryLock(lockPath) {
|
|
15830
|
+
for (let attempt = 0; attempt < 3; attempt++) {
|
|
15831
|
+
try {
|
|
15832
|
+
await (0, import_promises27.writeFile)(lockPath, String(process.pid), { flag: "wx" });
|
|
15833
|
+
return true;
|
|
15834
|
+
} catch (err) {
|
|
15835
|
+
if (err.code !== "EEXIST") {
|
|
15836
|
+
throw err;
|
|
15837
|
+
}
|
|
15838
|
+
try {
|
|
15839
|
+
const pidStr = await (0, import_promises27.readFile)(lockPath, "utf-8");
|
|
15840
|
+
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
15841
|
+
if (!Number.isNaN(pid)) {
|
|
15842
|
+
try {
|
|
15843
|
+
process.kill(pid, 0);
|
|
15844
|
+
return false;
|
|
15845
|
+
} catch {
|
|
15846
|
+
await (0, import_promises27.unlink)(lockPath).catch(() => {
|
|
15847
|
+
});
|
|
15848
|
+
continue;
|
|
15849
|
+
}
|
|
15850
|
+
}
|
|
15851
|
+
} catch {
|
|
15852
|
+
}
|
|
15853
|
+
return false;
|
|
15854
|
+
}
|
|
15855
|
+
}
|
|
15856
|
+
return false;
|
|
15857
|
+
}
|
|
15858
|
+
/**
|
|
15859
|
+
* Check if the stored fingerprint in metadata.json differs from the computed one.
|
|
15860
|
+
* Returns true if drifted, false otherwise.
|
|
15861
|
+
* Returns false (no drift) if metadata.json doesn't exist (first use).
|
|
15862
|
+
*/
|
|
15863
|
+
async checkDrift(poolDir, fingerprint) {
|
|
15864
|
+
const metadataPath = import_node_path39.default.join(poolDir, "metadata.json");
|
|
15865
|
+
try {
|
|
15866
|
+
const raw = await (0, import_promises27.readFile)(metadataPath, "utf-8");
|
|
15867
|
+
const metadata = JSON.parse(raw);
|
|
15868
|
+
return metadata.fingerprint !== fingerprint;
|
|
15869
|
+
} catch {
|
|
15870
|
+
return false;
|
|
15871
|
+
}
|
|
15872
|
+
}
|
|
15873
|
+
/** Write metadata.json with fingerprint, inputs, and timestamp. */
|
|
15874
|
+
async writeMetadata(poolDir, fingerprint, templatePath, repos) {
|
|
15875
|
+
const metadata = {
|
|
15876
|
+
fingerprint,
|
|
15877
|
+
templatePath,
|
|
15878
|
+
repos,
|
|
15879
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
15880
|
+
};
|
|
15881
|
+
await (0, import_promises27.writeFile)(import_node_path39.default.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
|
|
15882
|
+
}
|
|
15883
|
+
/** Remove all slot directories and their lock files from a pool directory. */
|
|
15884
|
+
async removeAllSlots(poolDir) {
|
|
15885
|
+
const entries = await (0, import_promises27.readdir)(poolDir);
|
|
15886
|
+
for (const entry of entries) {
|
|
15887
|
+
if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
|
|
15888
|
+
const lockPath = import_node_path39.default.join(poolDir, `${entry}.lock`);
|
|
15889
|
+
if ((0, import_node_fs11.existsSync)(lockPath)) {
|
|
15890
|
+
try {
|
|
15891
|
+
const pidStr = await (0, import_promises27.readFile)(lockPath, "utf-8");
|
|
15892
|
+
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
15893
|
+
if (!Number.isNaN(pid)) {
|
|
15894
|
+
try {
|
|
15895
|
+
process.kill(pid, 0);
|
|
15896
|
+
console.warn(`[workspace-pool] Skipping slot ${entry}: locked by PID ${pid}`);
|
|
15897
|
+
continue;
|
|
15898
|
+
} catch {
|
|
15899
|
+
}
|
|
15900
|
+
}
|
|
15901
|
+
} catch {
|
|
15902
|
+
}
|
|
15903
|
+
}
|
|
15904
|
+
await (0, import_promises27.rm)(import_node_path39.default.join(poolDir, entry), { recursive: true, force: true });
|
|
15905
|
+
await (0, import_promises27.rm)(lockPath, { force: true }).catch(() => {
|
|
15906
|
+
});
|
|
15907
|
+
}
|
|
15908
|
+
}
|
|
15909
|
+
await (0, import_promises27.rm)(import_node_path39.default.join(poolDir, "metadata.json"), { force: true }).catch(() => {
|
|
15910
|
+
});
|
|
15911
|
+
}
|
|
15912
|
+
/**
|
|
15913
|
+
* Reset an existing slot for reuse:
|
|
15914
|
+
* 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
|
|
15915
|
+
* 2. Re-copy template files (skip repo directories)
|
|
15916
|
+
*/
|
|
15917
|
+
async resetSlot(slotPath, templatePath, repos) {
|
|
15918
|
+
for (const repo of repos) {
|
|
15919
|
+
const repoDir = import_node_path39.default.join(slotPath, repo.path);
|
|
15920
|
+
if (!(0, import_node_fs11.existsSync)(repoDir)) {
|
|
15921
|
+
continue;
|
|
15922
|
+
}
|
|
15923
|
+
const ref = repo.checkout?.ref ?? "HEAD";
|
|
15924
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
15925
|
+
await git(["clean", "-fd"], { cwd: repoDir });
|
|
15926
|
+
}
|
|
15927
|
+
if (templatePath) {
|
|
15928
|
+
const repoDirNames = new Set(
|
|
15929
|
+
repos.map((r) => {
|
|
15930
|
+
const normalized = r.path.replace(/^\.\//, "");
|
|
15931
|
+
return normalized.split("/")[0];
|
|
15932
|
+
})
|
|
15933
|
+
);
|
|
15934
|
+
await copyDirectoryRecursive2(templatePath, slotPath, repoDirNames);
|
|
15935
|
+
}
|
|
15936
|
+
}
|
|
15937
|
+
};
|
|
15938
|
+
|
|
15939
|
+
// src/evaluation/workspace/repo-manager.ts
|
|
15940
|
+
var import_node_child_process8 = require("child_process");
|
|
15941
|
+
var import_node_crypto9 = require("crypto");
|
|
15942
|
+
var import_node_fs12 = require("fs");
|
|
15943
|
+
var import_promises28 = require("fs/promises");
|
|
15944
|
+
var import_node_path40 = __toESM(require("path"), 1);
|
|
15945
|
+
var import_node_util6 = require("util");
|
|
15946
|
+
var execFileAsync2 = (0, import_node_util6.promisify)(import_node_child_process8.execFile);
|
|
15947
|
+
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
15948
|
+
var LOCK_TIMEOUT_MS = 6e4;
|
|
15949
|
+
function gitEnv2() {
|
|
15950
|
+
const env = { ...process.env };
|
|
15951
|
+
for (const key of Object.keys(env)) {
|
|
15952
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
15953
|
+
delete env[key];
|
|
15954
|
+
}
|
|
15955
|
+
}
|
|
15956
|
+
return {
|
|
15957
|
+
...env,
|
|
15958
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
15959
|
+
GIT_ASKPASS: "",
|
|
15960
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
15961
|
+
};
|
|
15962
|
+
}
|
|
15963
|
+
function cacheKey(source) {
|
|
15964
|
+
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
15965
|
+
return (0, import_node_crypto9.createHash)("sha256").update(raw).digest("hex");
|
|
15966
|
+
}
|
|
15967
|
+
function getSourceUrl(source) {
|
|
15968
|
+
return source.type === "git" ? source.url : source.path;
|
|
15969
|
+
}
|
|
15970
|
+
async function git2(args, opts) {
|
|
15971
|
+
const { stdout } = await execFileAsync2("git", args, {
|
|
15972
|
+
cwd: opts?.cwd,
|
|
15973
|
+
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
15974
|
+
env: gitEnv2(),
|
|
15975
|
+
maxBuffer: 50 * 1024 * 1024
|
|
15976
|
+
// 50MB
|
|
15977
|
+
});
|
|
15978
|
+
return stdout.trim();
|
|
15979
|
+
}
|
|
15980
|
+
async function acquireLock(lockPath) {
|
|
15981
|
+
const start = Date.now();
|
|
15982
|
+
while (Date.now() - start < LOCK_TIMEOUT_MS) {
|
|
15983
|
+
try {
|
|
15984
|
+
await (0, import_promises28.writeFile)(lockPath, String(process.pid), { flag: "wx" });
|
|
15985
|
+
return;
|
|
15986
|
+
} catch (err) {
|
|
15987
|
+
if (err.code === "EEXIST") {
|
|
15988
|
+
await new Promise((r) => setTimeout(r, 200));
|
|
15989
|
+
continue;
|
|
15990
|
+
}
|
|
15991
|
+
throw err;
|
|
15992
|
+
}
|
|
15993
|
+
}
|
|
15994
|
+
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
15995
|
+
}
|
|
15996
|
+
async function releaseLock(lockPath) {
|
|
15997
|
+
try {
|
|
15998
|
+
await (0, import_promises28.unlink)(lockPath);
|
|
15999
|
+
} catch {
|
|
16000
|
+
}
|
|
16001
|
+
}
|
|
16002
|
+
var RepoManager = class {
|
|
16003
|
+
cacheDir;
|
|
16004
|
+
verbose;
|
|
16005
|
+
constructor(cacheDir, verbose = false) {
|
|
16006
|
+
this.cacheDir = cacheDir ?? getGitCacheRoot();
|
|
16007
|
+
this.verbose = verbose;
|
|
16008
|
+
}
|
|
16009
|
+
async runGit(args, opts) {
|
|
16010
|
+
const startedAt = Date.now();
|
|
16011
|
+
if (this.verbose) {
|
|
16012
|
+
console.log(`[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`);
|
|
16013
|
+
}
|
|
16014
|
+
try {
|
|
16015
|
+
const output = await git2(args, opts);
|
|
16016
|
+
if (this.verbose) {
|
|
16017
|
+
console.log(`[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`);
|
|
16018
|
+
}
|
|
16019
|
+
return output;
|
|
16020
|
+
} catch (error) {
|
|
16021
|
+
if (this.verbose) {
|
|
16022
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16023
|
+
console.log(
|
|
16024
|
+
`[repo] git fail durationMs=${Date.now() - startedAt} args=${args.join(" ")} error=${message}`
|
|
16025
|
+
);
|
|
16026
|
+
}
|
|
16027
|
+
throw error;
|
|
15619
16028
|
}
|
|
15620
16029
|
}
|
|
15621
16030
|
/**
|
|
@@ -15625,9 +16034,9 @@ var RepoManager = class {
|
|
|
15625
16034
|
*/
|
|
15626
16035
|
async ensureCache(source, depth, resolve) {
|
|
15627
16036
|
const key = cacheKey(source);
|
|
15628
|
-
const cachePath =
|
|
16037
|
+
const cachePath = import_node_path40.default.join(this.cacheDir, key);
|
|
15629
16038
|
const lockPath = `${cachePath}.lock`;
|
|
15630
|
-
const cacheExists = (0,
|
|
16039
|
+
const cacheExists = (0, import_node_fs12.existsSync)(import_node_path40.default.join(cachePath, "HEAD"));
|
|
15631
16040
|
if (this.verbose) {
|
|
15632
16041
|
console.log(
|
|
15633
16042
|
`[repo] ensureCache source=${getSourceUrl(source)} resolve=${resolve ?? "remote"} cache=${cachePath}`
|
|
@@ -15645,13 +16054,11 @@ var RepoManager = class {
|
|
|
15645
16054
|
`No cache found for \`${url}\`. Run \`agentv cache add --url ${url} --from <local-path>\` to seed it.`
|
|
15646
16055
|
);
|
|
15647
16056
|
}
|
|
15648
|
-
await (0,
|
|
16057
|
+
await (0, import_promises28.mkdir)(this.cacheDir, { recursive: true });
|
|
15649
16058
|
const lockStartedAt = Date.now();
|
|
15650
16059
|
await acquireLock(lockPath);
|
|
15651
16060
|
if (this.verbose) {
|
|
15652
|
-
console.log(
|
|
15653
|
-
`[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`
|
|
15654
|
-
);
|
|
16061
|
+
console.log(`[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`);
|
|
15655
16062
|
}
|
|
15656
16063
|
try {
|
|
15657
16064
|
if (cacheExists) {
|
|
@@ -15689,7 +16096,7 @@ var RepoManager = class {
|
|
|
15689
16096
|
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
15690
16097
|
*/
|
|
15691
16098
|
async materialize(repo, workspacePath) {
|
|
15692
|
-
const targetDir =
|
|
16099
|
+
const targetDir = import_node_path40.default.join(workspacePath, repo.path);
|
|
15693
16100
|
const startedAt = Date.now();
|
|
15694
16101
|
if (this.verbose) {
|
|
15695
16102
|
console.log(
|
|
@@ -15784,14 +16191,14 @@ var RepoManager = class {
|
|
|
15784
16191
|
async reset(repos, workspacePath, strategy) {
|
|
15785
16192
|
if (strategy === "recreate") {
|
|
15786
16193
|
for (const repo of repos) {
|
|
15787
|
-
const targetDir =
|
|
15788
|
-
await (0,
|
|
16194
|
+
const targetDir = import_node_path40.default.join(workspacePath, repo.path);
|
|
16195
|
+
await (0, import_promises28.rm)(targetDir, { recursive: true, force: true });
|
|
15789
16196
|
}
|
|
15790
16197
|
await this.materializeAll(repos, workspacePath);
|
|
15791
16198
|
return;
|
|
15792
16199
|
}
|
|
15793
16200
|
for (const repo of repos) {
|
|
15794
|
-
const targetDir =
|
|
16201
|
+
const targetDir = import_node_path40.default.join(workspacePath, repo.path);
|
|
15795
16202
|
await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
|
|
15796
16203
|
await this.runGit(["clean", "-fd"], { cwd: targetDir });
|
|
15797
16204
|
}
|
|
@@ -15803,21 +16210,21 @@ var RepoManager = class {
|
|
|
15803
16210
|
async seedCache(localPath, remoteUrl, opts) {
|
|
15804
16211
|
const source = { type: "git", url: remoteUrl };
|
|
15805
16212
|
const key = cacheKey(source);
|
|
15806
|
-
const cachePath =
|
|
16213
|
+
const cachePath = import_node_path40.default.join(this.cacheDir, key);
|
|
15807
16214
|
const lockPath = `${cachePath}.lock`;
|
|
15808
|
-
await (0,
|
|
16215
|
+
await (0, import_promises28.mkdir)(this.cacheDir, { recursive: true });
|
|
15809
16216
|
await acquireLock(lockPath);
|
|
15810
16217
|
try {
|
|
15811
|
-
if ((0,
|
|
16218
|
+
if ((0, import_node_fs12.existsSync)(import_node_path40.default.join(cachePath, "HEAD"))) {
|
|
15812
16219
|
if (!opts?.force) {
|
|
15813
16220
|
throw new Error(
|
|
15814
16221
|
`Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
|
|
15815
16222
|
);
|
|
15816
16223
|
}
|
|
15817
|
-
await (0,
|
|
16224
|
+
await (0, import_promises28.rm)(cachePath, { recursive: true, force: true });
|
|
15818
16225
|
}
|
|
15819
|
-
await
|
|
15820
|
-
await
|
|
16226
|
+
await git2(["clone", "--mirror", "--bare", localPath, cachePath]);
|
|
16227
|
+
await git2(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
|
|
15821
16228
|
} finally {
|
|
15822
16229
|
await releaseLock(lockPath);
|
|
15823
16230
|
}
|
|
@@ -15825,41 +16232,41 @@ var RepoManager = class {
|
|
|
15825
16232
|
}
|
|
15826
16233
|
/** Remove the entire cache directory. */
|
|
15827
16234
|
async cleanCache() {
|
|
15828
|
-
await (0,
|
|
16235
|
+
await (0, import_promises28.rm)(this.cacheDir, { recursive: true, force: true });
|
|
15829
16236
|
}
|
|
15830
16237
|
};
|
|
15831
16238
|
|
|
15832
16239
|
// src/evaluation/workspace/resolve.ts
|
|
15833
|
-
var
|
|
15834
|
-
var
|
|
16240
|
+
var import_promises29 = require("fs/promises");
|
|
16241
|
+
var import_node_path41 = __toESM(require("path"), 1);
|
|
15835
16242
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
15836
16243
|
if (!templatePath) {
|
|
15837
16244
|
return void 0;
|
|
15838
16245
|
}
|
|
15839
|
-
const resolved =
|
|
15840
|
-
const stats = await (0,
|
|
16246
|
+
const resolved = import_node_path41.default.resolve(templatePath);
|
|
16247
|
+
const stats = await (0, import_promises29.stat)(resolved);
|
|
15841
16248
|
if (stats.isFile()) {
|
|
15842
16249
|
return {
|
|
15843
|
-
dir:
|
|
16250
|
+
dir: import_node_path41.default.dirname(resolved),
|
|
15844
16251
|
workspaceFile: resolved
|
|
15845
16252
|
};
|
|
15846
16253
|
}
|
|
15847
16254
|
if (!stats.isDirectory()) {
|
|
15848
16255
|
throw new Error(`workspace template is neither a file nor a directory: ${resolved}`);
|
|
15849
16256
|
}
|
|
15850
|
-
const entries = await (0,
|
|
16257
|
+
const entries = await (0, import_promises29.readdir)(resolved);
|
|
15851
16258
|
const workspaceFiles = entries.filter((e) => e.endsWith(".code-workspace"));
|
|
15852
16259
|
if (workspaceFiles.length === 1) {
|
|
15853
16260
|
return {
|
|
15854
16261
|
dir: resolved,
|
|
15855
|
-
workspaceFile:
|
|
16262
|
+
workspaceFile: import_node_path41.default.join(resolved, workspaceFiles[0])
|
|
15856
16263
|
};
|
|
15857
16264
|
}
|
|
15858
16265
|
if (workspaceFiles.length > 1) {
|
|
15859
16266
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
15860
16267
|
return {
|
|
15861
16268
|
dir: resolved,
|
|
15862
|
-
workspaceFile: conventionFile ?
|
|
16269
|
+
workspaceFile: conventionFile ? import_node_path41.default.join(resolved, conventionFile) : void 0
|
|
15863
16270
|
};
|
|
15864
16271
|
}
|
|
15865
16272
|
return { dir: resolved };
|
|
@@ -15941,7 +16348,10 @@ async function runEvaluation(options) {
|
|
|
15941
16348
|
trials,
|
|
15942
16349
|
streamCallbacks,
|
|
15943
16350
|
totalBudgetUsd,
|
|
15944
|
-
failOnError
|
|
16351
|
+
failOnError,
|
|
16352
|
+
poolWorkspaces,
|
|
16353
|
+
poolMaxSlots: configPoolMaxSlots,
|
|
16354
|
+
workspace: userWorkspacePath
|
|
15945
16355
|
} = options;
|
|
15946
16356
|
let useCache = options.useCache;
|
|
15947
16357
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -15950,7 +16360,7 @@ async function runEvaluation(options) {
|
|
|
15950
16360
|
);
|
|
15951
16361
|
useCache = false;
|
|
15952
16362
|
}
|
|
15953
|
-
const evalRunId = (0,
|
|
16363
|
+
const evalRunId = (0, import_node_crypto10.randomUUID)();
|
|
15954
16364
|
const evalCases = preloadedEvalCases ?? await loadTests(evalFilePath, repoRoot, { verbose, filter });
|
|
15955
16365
|
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
15956
16366
|
if (filteredEvalCases.length === 0) {
|
|
@@ -16015,7 +16425,7 @@ async function runEvaluation(options) {
|
|
|
16015
16425
|
];
|
|
16016
16426
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
16017
16427
|
const typeRegistry = createBuiltinRegistry();
|
|
16018
|
-
const discoveryBaseDir = evalFilePath ?
|
|
16428
|
+
const discoveryBaseDir = evalFilePath ? import_node_path42.default.dirname(import_node_path42.default.resolve(evalFilePath)) : process.cwd();
|
|
16019
16429
|
const evalDir = discoveryBaseDir;
|
|
16020
16430
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
16021
16431
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
@@ -16077,13 +16487,19 @@ async function runEvaluation(options) {
|
|
|
16077
16487
|
}
|
|
16078
16488
|
};
|
|
16079
16489
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
16080
|
-
|
|
16490
|
+
if (userWorkspacePath && isPerTestIsolation) {
|
|
16491
|
+
throw new Error(
|
|
16492
|
+
"--workspace is incompatible with isolation: per_test. Use isolation: shared (default)."
|
|
16493
|
+
);
|
|
16494
|
+
}
|
|
16495
|
+
const hasSharedWorkspace = !!(userWorkspacePath || workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
16496
|
+
const usePool = poolWorkspaces === true && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !userWorkspacePath;
|
|
16081
16497
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
16082
|
-
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
16498
|
+
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
16083
16499
|
setupLog(
|
|
16084
|
-
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
|
|
16500
|
+
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
|
|
16085
16501
|
);
|
|
16086
|
-
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
16502
|
+
if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
|
|
16087
16503
|
console.warn(
|
|
16088
16504
|
`Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
|
|
16089
16505
|
);
|
|
@@ -16092,7 +16508,37 @@ async function runEvaluation(options) {
|
|
|
16092
16508
|
let sharedWorkspacePath;
|
|
16093
16509
|
let sharedBaselineCommit;
|
|
16094
16510
|
let beforeAllOutput;
|
|
16095
|
-
|
|
16511
|
+
let poolManager;
|
|
16512
|
+
let poolSlot;
|
|
16513
|
+
const poolSlots = [];
|
|
16514
|
+
const availablePoolSlots = [];
|
|
16515
|
+
const poolSlotBaselines = /* @__PURE__ */ new Map();
|
|
16516
|
+
const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
|
|
16517
|
+
if (userWorkspacePath) {
|
|
16518
|
+
sharedWorkspacePath = userWorkspacePath;
|
|
16519
|
+
setupLog(`using user-provided workspace: ${userWorkspacePath}`);
|
|
16520
|
+
} else if (usePool && suiteWorkspace?.repos) {
|
|
16521
|
+
const slotsNeeded = workers;
|
|
16522
|
+
setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
|
|
16523
|
+
poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
|
|
16524
|
+
const poolRepoManager = new RepoManager(void 0, verbose);
|
|
16525
|
+
for (let i = 0; i < slotsNeeded; i++) {
|
|
16526
|
+
const slot = await poolManager.acquireWorkspace({
|
|
16527
|
+
templatePath: workspaceTemplate,
|
|
16528
|
+
repos: suiteWorkspace.repos,
|
|
16529
|
+
maxSlots: poolMaxSlots,
|
|
16530
|
+
repoManager: poolRepoManager
|
|
16531
|
+
});
|
|
16532
|
+
poolSlots.push(slot);
|
|
16533
|
+
setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
|
|
16534
|
+
}
|
|
16535
|
+
if (slotsNeeded === 1) {
|
|
16536
|
+
poolSlot = poolSlots[0];
|
|
16537
|
+
sharedWorkspacePath = poolSlot.path;
|
|
16538
|
+
} else {
|
|
16539
|
+
availablePoolSlots.push(...poolSlots);
|
|
16540
|
+
}
|
|
16541
|
+
} else if (workspaceTemplate) {
|
|
16096
16542
|
setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
|
|
16097
16543
|
try {
|
|
16098
16544
|
sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
|
|
@@ -16101,288 +16547,344 @@ async function runEvaluation(options) {
|
|
|
16101
16547
|
const message = error instanceof Error ? error.message : String(error);
|
|
16102
16548
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
16103
16549
|
}
|
|
16104
|
-
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
16105
|
-
const copiedWorkspaceFile = import_node_path41.default.join(sharedWorkspacePath, import_node_path41.default.basename(suiteWorkspaceFile));
|
|
16106
|
-
try {
|
|
16107
|
-
await (0, import_promises29.stat)(copiedWorkspaceFile);
|
|
16108
|
-
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
16109
|
-
} catch {
|
|
16110
|
-
}
|
|
16111
|
-
}
|
|
16112
16550
|
} else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
16113
16551
|
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
16114
|
-
await (0,
|
|
16552
|
+
await (0, import_promises30.mkdir)(sharedWorkspacePath, { recursive: true });
|
|
16115
16553
|
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
16116
16554
|
}
|
|
16117
|
-
|
|
16118
|
-
|
|
16119
|
-
|
|
16120
|
-
|
|
16121
|
-
|
|
16122
|
-
|
|
16123
|
-
|
|
16124
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
16125
|
-
if (sharedWorkspacePath) {
|
|
16126
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16127
|
-
});
|
|
16128
|
-
}
|
|
16129
|
-
throw new Error(`Failed to materialize repos: ${message}`);
|
|
16130
|
-
}
|
|
16131
|
-
}
|
|
16132
|
-
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
16133
|
-
const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
|
|
16134
|
-
setupLog(
|
|
16135
|
-
`running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
16136
|
-
);
|
|
16137
|
-
const scriptContext = {
|
|
16138
|
-
workspacePath: sharedWorkspacePath,
|
|
16139
|
-
testId: "__before_all__",
|
|
16140
|
-
evalRunId,
|
|
16141
|
-
evalDir
|
|
16142
|
-
};
|
|
16143
|
-
try {
|
|
16144
|
-
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
16145
|
-
setupLog("shared before_all completed");
|
|
16146
|
-
} catch (error) {
|
|
16147
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
16148
|
-
if (sharedWorkspacePath) {
|
|
16149
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16150
|
-
});
|
|
16555
|
+
try {
|
|
16556
|
+
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
16557
|
+
const copiedWorkspaceFile = import_node_path42.default.join(sharedWorkspacePath, import_node_path42.default.basename(suiteWorkspaceFile));
|
|
16558
|
+
try {
|
|
16559
|
+
await (0, import_promises30.stat)(copiedWorkspaceFile);
|
|
16560
|
+
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
16561
|
+
} catch {
|
|
16151
16562
|
}
|
|
16152
|
-
throw new Error(`before_all script failed: ${message}`);
|
|
16153
16563
|
}
|
|
16154
|
-
|
|
16155
|
-
|
|
16156
|
-
|
|
16157
|
-
|
|
16158
|
-
|
|
16159
|
-
|
|
16160
|
-
|
|
16161
|
-
|
|
16162
|
-
|
|
16163
|
-
|
|
16164
|
-
|
|
16165
|
-
|
|
16166
|
-
let cumulativeBudgetCost = 0;
|
|
16167
|
-
let budgetExhausted = false;
|
|
16168
|
-
let failOnErrorTriggered = false;
|
|
16169
|
-
const promises = filteredEvalCases.map(
|
|
16170
|
-
(evalCase) => limit(async () => {
|
|
16171
|
-
const workerId = nextWorkerId++;
|
|
16172
|
-
workerIdByEvalId.set(evalCase.id, workerId);
|
|
16173
|
-
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
16174
|
-
const budgetResult = {
|
|
16175
|
-
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16176
|
-
testId: evalCase.id,
|
|
16177
|
-
dataset: evalCase.dataset,
|
|
16178
|
-
score: 0,
|
|
16179
|
-
hits: [],
|
|
16180
|
-
misses: [],
|
|
16181
|
-
answer: "",
|
|
16182
|
-
target: target.name,
|
|
16183
|
-
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16184
|
-
budgetExceeded: true,
|
|
16185
|
-
executionStatus: "execution_error",
|
|
16186
|
-
failureStage: "setup",
|
|
16187
|
-
failureReasonCode: "budget_exceeded",
|
|
16188
|
-
executionError: {
|
|
16189
|
-
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16190
|
-
stage: "setup"
|
|
16191
|
-
}
|
|
16192
|
-
};
|
|
16193
|
-
if (onProgress) {
|
|
16194
|
-
await onProgress({
|
|
16195
|
-
workerId,
|
|
16196
|
-
testId: evalCase.id,
|
|
16197
|
-
status: "failed",
|
|
16198
|
-
completedAt: Date.now(),
|
|
16199
|
-
error: budgetResult.error
|
|
16564
|
+
const repoManager = suiteWorkspace?.repos?.length && !usePool && !userWorkspacePath ? new RepoManager(void 0, verbose) : void 0;
|
|
16565
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
16566
|
+
setupLog(
|
|
16567
|
+
`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
|
|
16568
|
+
);
|
|
16569
|
+
try {
|
|
16570
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
16571
|
+
setupLog("shared repo materialization complete");
|
|
16572
|
+
} catch (error) {
|
|
16573
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16574
|
+
if (sharedWorkspacePath && !userWorkspacePath) {
|
|
16575
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16200
16576
|
});
|
|
16201
16577
|
}
|
|
16202
|
-
|
|
16203
|
-
await onResult(budgetResult);
|
|
16204
|
-
}
|
|
16205
|
-
return budgetResult;
|
|
16578
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
16206
16579
|
}
|
|
16207
|
-
|
|
16208
|
-
|
|
16209
|
-
|
|
16210
|
-
|
|
16211
|
-
|
|
16212
|
-
|
|
16213
|
-
|
|
16214
|
-
|
|
16215
|
-
|
|
16216
|
-
|
|
16217
|
-
|
|
16218
|
-
|
|
16219
|
-
|
|
16220
|
-
|
|
16221
|
-
|
|
16222
|
-
|
|
16223
|
-
|
|
16224
|
-
if (
|
|
16225
|
-
await
|
|
16226
|
-
workerId,
|
|
16227
|
-
testId: evalCase.id,
|
|
16228
|
-
status: "failed",
|
|
16229
|
-
completedAt: Date.now(),
|
|
16230
|
-
error: haltResult.error
|
|
16580
|
+
}
|
|
16581
|
+
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
16582
|
+
const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
|
|
16583
|
+
setupLog(
|
|
16584
|
+
`running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
16585
|
+
);
|
|
16586
|
+
const scriptContext = {
|
|
16587
|
+
workspacePath: sharedWorkspacePath,
|
|
16588
|
+
testId: "__before_all__",
|
|
16589
|
+
evalRunId,
|
|
16590
|
+
evalDir
|
|
16591
|
+
};
|
|
16592
|
+
try {
|
|
16593
|
+
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
16594
|
+
setupLog("shared before_all completed");
|
|
16595
|
+
} catch (error) {
|
|
16596
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16597
|
+
if (sharedWorkspacePath && !userWorkspacePath) {
|
|
16598
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16231
16599
|
});
|
|
16232
16600
|
}
|
|
16233
|
-
|
|
16234
|
-
await onResult(haltResult);
|
|
16235
|
-
}
|
|
16236
|
-
return haltResult;
|
|
16601
|
+
throw new Error(`before_all script failed: ${message}`);
|
|
16237
16602
|
}
|
|
16238
|
-
|
|
16239
|
-
|
|
16240
|
-
|
|
16241
|
-
|
|
16242
|
-
|
|
16243
|
-
|
|
16244
|
-
|
|
16245
|
-
}
|
|
16246
|
-
try {
|
|
16247
|
-
const judgeProvider = await resolveJudgeProvider(target);
|
|
16248
|
-
const runCaseOptions = {
|
|
16249
|
-
evalCase,
|
|
16250
|
-
provider: primaryProvider,
|
|
16251
|
-
target,
|
|
16252
|
-
evaluators: evaluatorRegistry,
|
|
16253
|
-
maxRetries,
|
|
16254
|
-
agentTimeoutMs,
|
|
16255
|
-
cache,
|
|
16256
|
-
useCache,
|
|
16257
|
-
now,
|
|
16258
|
-
judgeProvider,
|
|
16259
|
-
targetResolver,
|
|
16260
|
-
availableTargets,
|
|
16603
|
+
}
|
|
16604
|
+
if (availablePoolSlots.length > 0 && suiteWorkspace?.before_all) {
|
|
16605
|
+
for (const slot of availablePoolSlots) {
|
|
16606
|
+
setupLog(`running before_all on pool slot ${slot.index}`);
|
|
16607
|
+
const scriptContext = {
|
|
16608
|
+
workspacePath: slot.path,
|
|
16609
|
+
testId: "__before_all__",
|
|
16261
16610
|
evalRunId,
|
|
16262
|
-
keepWorkspaces,
|
|
16263
|
-
cleanupWorkspaces,
|
|
16264
|
-
sharedWorkspacePath,
|
|
16265
|
-
sharedBaselineCommit,
|
|
16266
|
-
suiteWorkspaceFile,
|
|
16267
|
-
streamCallbacks,
|
|
16268
|
-
typeRegistry,
|
|
16269
|
-
repoManager,
|
|
16270
16611
|
evalDir
|
|
16271
16612
|
};
|
|
16272
|
-
|
|
16273
|
-
|
|
16274
|
-
|
|
16275
|
-
|
|
16276
|
-
|
|
16277
|
-
|
|
16278
|
-
|
|
16613
|
+
try {
|
|
16614
|
+
const output = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
16615
|
+
if (!beforeAllOutput) beforeAllOutput = output;
|
|
16616
|
+
setupLog(`before_all completed on pool slot ${slot.index}`);
|
|
16617
|
+
} catch (error) {
|
|
16618
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16619
|
+
throw new Error(`before_all script failed on pool slot ${slot.index}: ${message}`);
|
|
16620
|
+
}
|
|
16621
|
+
}
|
|
16622
|
+
}
|
|
16623
|
+
if (sharedWorkspacePath) {
|
|
16624
|
+
try {
|
|
16625
|
+
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
16626
|
+
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
16627
|
+
} catch {
|
|
16628
|
+
setupLog("shared baseline initialization skipped (non-fatal)");
|
|
16629
|
+
}
|
|
16630
|
+
}
|
|
16631
|
+
if (availablePoolSlots.length > 0) {
|
|
16632
|
+
for (const slot of availablePoolSlots) {
|
|
16633
|
+
try {
|
|
16634
|
+
const baseline = await initializeBaseline(slot.path);
|
|
16635
|
+
poolSlotBaselines.set(slot.path, baseline);
|
|
16636
|
+
setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
|
|
16637
|
+
} catch {
|
|
16638
|
+
setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`);
|
|
16639
|
+
}
|
|
16640
|
+
}
|
|
16641
|
+
}
|
|
16642
|
+
let nextWorkerId = 1;
|
|
16643
|
+
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
16644
|
+
let beforeAllOutputAttached = false;
|
|
16645
|
+
let cumulativeBudgetCost = 0;
|
|
16646
|
+
let budgetExhausted = false;
|
|
16647
|
+
let failOnErrorTriggered = false;
|
|
16648
|
+
const promises = filteredEvalCases.map(
|
|
16649
|
+
(evalCase) => limit(async () => {
|
|
16650
|
+
const workerId = nextWorkerId++;
|
|
16651
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
16652
|
+
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
16653
|
+
const budgetResult = {
|
|
16654
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16655
|
+
testId: evalCase.id,
|
|
16656
|
+
dataset: evalCase.dataset,
|
|
16657
|
+
score: 0,
|
|
16658
|
+
hits: [],
|
|
16659
|
+
misses: [],
|
|
16660
|
+
answer: "",
|
|
16661
|
+
target: target.name,
|
|
16662
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16663
|
+
budgetExceeded: true,
|
|
16664
|
+
executionStatus: "execution_error",
|
|
16665
|
+
failureStage: "setup",
|
|
16666
|
+
failureReasonCode: "budget_exceeded",
|
|
16667
|
+
executionError: {
|
|
16668
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16669
|
+
stage: "setup"
|
|
16279
16670
|
}
|
|
16280
|
-
}
|
|
16281
|
-
|
|
16671
|
+
};
|
|
16672
|
+
if (onProgress) {
|
|
16673
|
+
await onProgress({
|
|
16674
|
+
workerId,
|
|
16675
|
+
testId: evalCase.id,
|
|
16676
|
+
status: "failed",
|
|
16677
|
+
completedAt: Date.now(),
|
|
16678
|
+
error: budgetResult.error
|
|
16679
|
+
});
|
|
16282
16680
|
}
|
|
16283
|
-
if (
|
|
16284
|
-
|
|
16285
|
-
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
16286
|
-
budgetExhausted = true;
|
|
16287
|
-
}
|
|
16681
|
+
if (onResult) {
|
|
16682
|
+
await onResult(budgetResult);
|
|
16288
16683
|
}
|
|
16684
|
+
return budgetResult;
|
|
16289
16685
|
}
|
|
16290
|
-
if (failOnError === true &&
|
|
16291
|
-
|
|
16292
|
-
|
|
16293
|
-
|
|
16294
|
-
|
|
16295
|
-
|
|
16686
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
16687
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
16688
|
+
const haltResult = {
|
|
16689
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16690
|
+
testId: evalCase.id,
|
|
16691
|
+
dataset: evalCase.dataset,
|
|
16692
|
+
score: 0,
|
|
16693
|
+
hits: [],
|
|
16694
|
+
misses: [],
|
|
16695
|
+
answer: "",
|
|
16696
|
+
target: target.name,
|
|
16697
|
+
error: errorMsg,
|
|
16698
|
+
executionStatus: "execution_error",
|
|
16699
|
+
failureStage: "setup",
|
|
16700
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
16701
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
16702
|
+
};
|
|
16703
|
+
if (onProgress) {
|
|
16704
|
+
await onProgress({
|
|
16705
|
+
workerId,
|
|
16706
|
+
testId: evalCase.id,
|
|
16707
|
+
status: "failed",
|
|
16708
|
+
completedAt: Date.now(),
|
|
16709
|
+
error: haltResult.error
|
|
16710
|
+
});
|
|
16711
|
+
}
|
|
16712
|
+
if (onResult) {
|
|
16713
|
+
await onResult(haltResult);
|
|
16714
|
+
}
|
|
16715
|
+
return haltResult;
|
|
16296
16716
|
}
|
|
16297
16717
|
if (onProgress) {
|
|
16298
16718
|
await onProgress({
|
|
16299
16719
|
workerId,
|
|
16300
16720
|
testId: evalCase.id,
|
|
16301
|
-
status:
|
|
16302
|
-
startedAt:
|
|
16303
|
-
// Not used for completed status
|
|
16304
|
-
completedAt: Date.now(),
|
|
16305
|
-
error: result.error
|
|
16721
|
+
status: "running",
|
|
16722
|
+
startedAt: Date.now()
|
|
16306
16723
|
});
|
|
16307
16724
|
}
|
|
16308
|
-
|
|
16309
|
-
|
|
16725
|
+
const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : void 0;
|
|
16726
|
+
const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath;
|
|
16727
|
+
const testBaselineCommit = testPoolSlot ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit;
|
|
16728
|
+
try {
|
|
16729
|
+
const judgeProvider = await resolveJudgeProvider(target);
|
|
16730
|
+
const runCaseOptions = {
|
|
16731
|
+
evalCase,
|
|
16732
|
+
provider: primaryProvider,
|
|
16733
|
+
target,
|
|
16734
|
+
evaluators: evaluatorRegistry,
|
|
16735
|
+
maxRetries,
|
|
16736
|
+
agentTimeoutMs,
|
|
16737
|
+
cache,
|
|
16738
|
+
useCache,
|
|
16739
|
+
now,
|
|
16740
|
+
judgeProvider,
|
|
16741
|
+
targetResolver,
|
|
16742
|
+
availableTargets,
|
|
16743
|
+
evalRunId,
|
|
16744
|
+
keepWorkspaces,
|
|
16745
|
+
cleanupWorkspaces,
|
|
16746
|
+
sharedWorkspacePath: testWorkspacePath,
|
|
16747
|
+
sharedBaselineCommit: testBaselineCommit,
|
|
16748
|
+
suiteWorkspaceFile,
|
|
16749
|
+
streamCallbacks,
|
|
16750
|
+
typeRegistry,
|
|
16751
|
+
repoManager,
|
|
16752
|
+
evalDir
|
|
16753
|
+
};
|
|
16754
|
+
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
16755
|
+
if (totalBudgetUsd !== void 0) {
|
|
16756
|
+
let caseCost;
|
|
16757
|
+
if (result.trials && result.trials.length > 0) {
|
|
16758
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
16759
|
+
if (trialCostSum > 0) {
|
|
16760
|
+
caseCost = trialCostSum;
|
|
16761
|
+
}
|
|
16762
|
+
} else {
|
|
16763
|
+
caseCost = result.costUsd;
|
|
16764
|
+
}
|
|
16765
|
+
if (caseCost !== void 0) {
|
|
16766
|
+
cumulativeBudgetCost += caseCost;
|
|
16767
|
+
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
16768
|
+
budgetExhausted = true;
|
|
16769
|
+
}
|
|
16770
|
+
}
|
|
16771
|
+
}
|
|
16772
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
16773
|
+
failOnErrorTriggered = true;
|
|
16774
|
+
}
|
|
16775
|
+
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
16776
|
+
result = { ...result, beforeAllOutput };
|
|
16777
|
+
beforeAllOutputAttached = true;
|
|
16778
|
+
}
|
|
16779
|
+
if (onProgress) {
|
|
16780
|
+
await onProgress({
|
|
16781
|
+
workerId,
|
|
16782
|
+
testId: evalCase.id,
|
|
16783
|
+
status: result.error ? "failed" : "completed",
|
|
16784
|
+
startedAt: 0,
|
|
16785
|
+
// Not used for completed status
|
|
16786
|
+
completedAt: Date.now(),
|
|
16787
|
+
error: result.error
|
|
16788
|
+
});
|
|
16789
|
+
}
|
|
16790
|
+
if (onResult) {
|
|
16791
|
+
await onResult(result);
|
|
16792
|
+
}
|
|
16793
|
+
return result;
|
|
16794
|
+
} catch (error) {
|
|
16795
|
+
if (onProgress) {
|
|
16796
|
+
await onProgress({
|
|
16797
|
+
workerId,
|
|
16798
|
+
testId: evalCase.id,
|
|
16799
|
+
status: "failed",
|
|
16800
|
+
completedAt: Date.now(),
|
|
16801
|
+
error: error instanceof Error ? error.message : String(error)
|
|
16802
|
+
});
|
|
16803
|
+
}
|
|
16804
|
+
throw error;
|
|
16805
|
+
} finally {
|
|
16806
|
+
if (testPoolSlot) {
|
|
16807
|
+
availablePoolSlots.push(testPoolSlot);
|
|
16808
|
+
}
|
|
16310
16809
|
}
|
|
16311
|
-
|
|
16312
|
-
|
|
16313
|
-
|
|
16314
|
-
|
|
16315
|
-
|
|
16316
|
-
|
|
16317
|
-
|
|
16318
|
-
|
|
16319
|
-
|
|
16320
|
-
|
|
16810
|
+
})
|
|
16811
|
+
);
|
|
16812
|
+
const settled = await Promise.allSettled(promises);
|
|
16813
|
+
const results = [];
|
|
16814
|
+
for (let i = 0; i < settled.length; i++) {
|
|
16815
|
+
const outcome = settled[i];
|
|
16816
|
+
if (outcome.status === "fulfilled") {
|
|
16817
|
+
results.push(outcome.value);
|
|
16818
|
+
} else {
|
|
16819
|
+
const evalCase = filteredEvalCases[i];
|
|
16820
|
+
const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
|
|
16821
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
16822
|
+
const errorResult = buildErrorResult(
|
|
16823
|
+
evalCase,
|
|
16824
|
+
target.name,
|
|
16825
|
+
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
16826
|
+
outcome.reason,
|
|
16827
|
+
promptInputs,
|
|
16828
|
+
primaryProvider,
|
|
16829
|
+
"agent",
|
|
16830
|
+
"provider_error"
|
|
16831
|
+
);
|
|
16832
|
+
results.push(errorResult);
|
|
16833
|
+
if (onResult) {
|
|
16834
|
+
await onResult(errorResult);
|
|
16321
16835
|
}
|
|
16322
|
-
throw error;
|
|
16323
16836
|
}
|
|
16324
|
-
}
|
|
16325
|
-
|
|
16326
|
-
|
|
16327
|
-
|
|
16328
|
-
|
|
16329
|
-
|
|
16330
|
-
|
|
16331
|
-
|
|
16332
|
-
|
|
16333
|
-
|
|
16334
|
-
|
|
16335
|
-
|
|
16336
|
-
|
|
16337
|
-
|
|
16338
|
-
|
|
16339
|
-
|
|
16340
|
-
|
|
16341
|
-
|
|
16342
|
-
|
|
16343
|
-
|
|
16344
|
-
|
|
16345
|
-
);
|
|
16346
|
-
results.push(errorResult);
|
|
16347
|
-
if (onResult) {
|
|
16348
|
-
await onResult(errorResult);
|
|
16837
|
+
}
|
|
16838
|
+
const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
|
|
16839
|
+
if (afterAllWorkspaces.length > 0 && suiteWorkspace?.after_all) {
|
|
16840
|
+
for (const wsPath of afterAllWorkspaces) {
|
|
16841
|
+
const scriptContext = {
|
|
16842
|
+
workspacePath: wsPath,
|
|
16843
|
+
testId: "__after_all__",
|
|
16844
|
+
evalRunId,
|
|
16845
|
+
evalDir
|
|
16846
|
+
};
|
|
16847
|
+
try {
|
|
16848
|
+
const afterAllOutput = await executeWorkspaceScript(
|
|
16849
|
+
suiteWorkspace.after_all,
|
|
16850
|
+
scriptContext,
|
|
16851
|
+
"warn"
|
|
16852
|
+
);
|
|
16853
|
+
if (afterAllOutput && results.length > 0 && wsPath === afterAllWorkspaces[0]) {
|
|
16854
|
+
results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
|
|
16855
|
+
}
|
|
16856
|
+
} catch {
|
|
16857
|
+
}
|
|
16349
16858
|
}
|
|
16350
16859
|
}
|
|
16351
|
-
|
|
16352
|
-
|
|
16353
|
-
|
|
16354
|
-
|
|
16355
|
-
|
|
16356
|
-
|
|
16357
|
-
|
|
16358
|
-
|
|
16359
|
-
try {
|
|
16360
|
-
const afterAllOutput = await executeWorkspaceScript(
|
|
16361
|
-
suiteWorkspace.after_all,
|
|
16362
|
-
scriptContext,
|
|
16363
|
-
"warn"
|
|
16364
|
-
);
|
|
16365
|
-
if (afterAllOutput && results.length > 0) {
|
|
16366
|
-
results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
|
|
16860
|
+
if (sharedWorkspacePath && !poolSlot && poolSlots.length === 0 && !userWorkspacePath) {
|
|
16861
|
+
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
16862
|
+
if (cleanupWorkspaces) {
|
|
16863
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16864
|
+
});
|
|
16865
|
+
} else if (!hasFailure && !keepWorkspaces) {
|
|
16866
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16867
|
+
});
|
|
16367
16868
|
}
|
|
16368
|
-
} catch {
|
|
16369
16869
|
}
|
|
16370
|
-
}
|
|
16371
|
-
if (sharedWorkspacePath) {
|
|
16372
|
-
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
16373
16870
|
if (cleanupWorkspaces) {
|
|
16374
|
-
await
|
|
16375
|
-
});
|
|
16376
|
-
} else if (!hasFailure && !keepWorkspaces) {
|
|
16377
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16871
|
+
await cleanupEvalWorkspaces(evalRunId).catch(() => {
|
|
16378
16872
|
});
|
|
16379
16873
|
}
|
|
16874
|
+
return results;
|
|
16875
|
+
} finally {
|
|
16876
|
+
if (poolManager) {
|
|
16877
|
+
if (poolSlot) {
|
|
16878
|
+
await poolManager.releaseSlot(poolSlot);
|
|
16879
|
+
}
|
|
16880
|
+
for (const slot of poolSlots) {
|
|
16881
|
+
if (slot !== poolSlot) {
|
|
16882
|
+
await poolManager.releaseSlot(slot).catch(() => {
|
|
16883
|
+
});
|
|
16884
|
+
}
|
|
16885
|
+
}
|
|
16886
|
+
}
|
|
16380
16887
|
}
|
|
16381
|
-
if (cleanupWorkspaces) {
|
|
16382
|
-
await cleanupEvalWorkspaces(evalRunId).catch(() => {
|
|
16383
|
-
});
|
|
16384
|
-
}
|
|
16385
|
-
return results;
|
|
16386
16888
|
}
|
|
16387
16889
|
async function runBatchEvaluation(options) {
|
|
16388
16890
|
const {
|
|
@@ -16599,9 +17101,9 @@ async function runEvalCase(options) {
|
|
|
16599
17101
|
);
|
|
16600
17102
|
}
|
|
16601
17103
|
if (caseWorkspaceFile && workspacePath) {
|
|
16602
|
-
const copiedFile =
|
|
17104
|
+
const copiedFile = import_node_path42.default.join(workspacePath, import_node_path42.default.basename(caseWorkspaceFile));
|
|
16603
17105
|
try {
|
|
16604
|
-
await (0,
|
|
17106
|
+
await (0, import_promises30.stat)(copiedFile);
|
|
16605
17107
|
caseWorkspaceFile = copiedFile;
|
|
16606
17108
|
} catch {
|
|
16607
17109
|
}
|
|
@@ -16609,7 +17111,7 @@ async function runEvalCase(options) {
|
|
|
16609
17111
|
}
|
|
16610
17112
|
if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
16611
17113
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
16612
|
-
await (0,
|
|
17114
|
+
await (0, import_promises30.mkdir)(workspacePath, { recursive: true });
|
|
16613
17115
|
}
|
|
16614
17116
|
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
16615
17117
|
const perCaseRepoManager = new RepoManager(void 0, setupDebug);
|
|
@@ -17209,7 +17711,7 @@ async function runEvaluatorList(options) {
|
|
|
17209
17711
|
fileChanges,
|
|
17210
17712
|
workspacePath
|
|
17211
17713
|
};
|
|
17212
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
17714
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path42.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
17213
17715
|
const dispatchContext = {
|
|
17214
17716
|
judgeProvider,
|
|
17215
17717
|
targetResolver,
|
|
@@ -17443,7 +17945,7 @@ function extractProviderError(response) {
|
|
|
17443
17945
|
return trimmed.length > 0 ? trimmed : void 0;
|
|
17444
17946
|
}
|
|
17445
17947
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
17446
|
-
const hash = (0,
|
|
17948
|
+
const hash = (0, import_node_crypto10.createHash)("sha256");
|
|
17447
17949
|
hash.update(provider.id);
|
|
17448
17950
|
hash.update(target.name);
|
|
17449
17951
|
hash.update(evalCase.id);
|
|
@@ -17511,8 +18013,8 @@ function computeWeightedMean(entries) {
|
|
|
17511
18013
|
}
|
|
17512
18014
|
|
|
17513
18015
|
// src/evaluation/evaluate.ts
|
|
17514
|
-
var
|
|
17515
|
-
var
|
|
18016
|
+
var import_node_fs13 = require("fs");
|
|
18017
|
+
var import_node_path43 = __toESM(require("path"), 1);
|
|
17516
18018
|
async function evaluate(config) {
|
|
17517
18019
|
const startTime = Date.now();
|
|
17518
18020
|
if (config.tests && config.specFile) {
|
|
@@ -17534,13 +18036,13 @@ async function evaluate(config) {
|
|
|
17534
18036
|
let evalCases;
|
|
17535
18037
|
let testFilePath;
|
|
17536
18038
|
if (config.specFile) {
|
|
17537
|
-
testFilePath =
|
|
18039
|
+
testFilePath = import_node_path43.default.resolve(config.specFile);
|
|
17538
18040
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
17539
18041
|
verbose: config.verbose,
|
|
17540
18042
|
filter: config.filter
|
|
17541
18043
|
});
|
|
17542
18044
|
} else {
|
|
17543
|
-
testFilePath =
|
|
18045
|
+
testFilePath = import_node_path43.default.join(process.cwd(), "__programmatic__.yaml");
|
|
17544
18046
|
evalCases = (config.tests ?? []).map((test) => {
|
|
17545
18047
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
17546
18048
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -17626,11 +18128,11 @@ function computeSummary(results, durationMs) {
|
|
|
17626
18128
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
17627
18129
|
async function discoverDefaultTarget(repoRoot) {
|
|
17628
18130
|
const cwd = process.cwd();
|
|
17629
|
-
const chain = buildDirectoryChain2(
|
|
18131
|
+
const chain = buildDirectoryChain2(import_node_path43.default.join(cwd, "_placeholder"), repoRoot);
|
|
17630
18132
|
for (const dir of chain) {
|
|
17631
18133
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
17632
|
-
const targetsPath =
|
|
17633
|
-
if (!(0,
|
|
18134
|
+
const targetsPath = import_node_path43.default.join(dir, candidate);
|
|
18135
|
+
if (!(0, import_node_fs13.existsSync)(targetsPath)) continue;
|
|
17634
18136
|
try {
|
|
17635
18137
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
17636
18138
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -17644,11 +18146,11 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
17644
18146
|
async function loadEnvHierarchy(repoRoot) {
|
|
17645
18147
|
const { readFileSync: readFileSync2 } = await import("fs");
|
|
17646
18148
|
const cwd = process.cwd();
|
|
17647
|
-
const chain = buildDirectoryChain2(
|
|
18149
|
+
const chain = buildDirectoryChain2(import_node_path43.default.join(cwd, "_placeholder"), repoRoot);
|
|
17648
18150
|
const envFiles = [];
|
|
17649
18151
|
for (const dir of chain) {
|
|
17650
|
-
const envPath =
|
|
17651
|
-
if ((0,
|
|
18152
|
+
const envPath = import_node_path43.default.join(dir, ".env");
|
|
18153
|
+
if ((0, import_node_fs13.existsSync)(envPath)) envFiles.push(envPath);
|
|
17652
18154
|
}
|
|
17653
18155
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
17654
18156
|
try {
|
|
@@ -17726,12 +18228,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
17726
18228
|
".agentv/config.js"
|
|
17727
18229
|
];
|
|
17728
18230
|
async function loadTsConfig(projectRoot) {
|
|
17729
|
-
const { existsSync:
|
|
18231
|
+
const { existsSync: existsSync5 } = await import("fs");
|
|
17730
18232
|
const { pathToFileURL } = await import("url");
|
|
17731
18233
|
const { join: join2 } = await import("path");
|
|
17732
18234
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
17733
18235
|
const filePath = join2(projectRoot, fileName);
|
|
17734
|
-
if (!
|
|
18236
|
+
if (!existsSync5(filePath)) {
|
|
17735
18237
|
continue;
|
|
17736
18238
|
}
|
|
17737
18239
|
try {
|
|
@@ -17828,8 +18330,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
17828
18330
|
}
|
|
17829
18331
|
|
|
17830
18332
|
// src/evaluation/cache/response-cache.ts
|
|
17831
|
-
var
|
|
17832
|
-
var
|
|
18333
|
+
var import_promises31 = require("fs/promises");
|
|
18334
|
+
var import_node_path44 = __toESM(require("path"), 1);
|
|
17833
18335
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
17834
18336
|
var ResponseCache = class {
|
|
17835
18337
|
cachePath;
|
|
@@ -17839,7 +18341,7 @@ var ResponseCache = class {
|
|
|
17839
18341
|
async get(key) {
|
|
17840
18342
|
const filePath = this.keyToPath(key);
|
|
17841
18343
|
try {
|
|
17842
|
-
const data = await (0,
|
|
18344
|
+
const data = await (0, import_promises31.readFile)(filePath, "utf8");
|
|
17843
18345
|
return JSON.parse(data);
|
|
17844
18346
|
} catch {
|
|
17845
18347
|
return void 0;
|
|
@@ -17847,13 +18349,13 @@ var ResponseCache = class {
|
|
|
17847
18349
|
}
|
|
17848
18350
|
async set(key, value) {
|
|
17849
18351
|
const filePath = this.keyToPath(key);
|
|
17850
|
-
const dir =
|
|
17851
|
-
await (0,
|
|
17852
|
-
await (0,
|
|
18352
|
+
const dir = import_node_path44.default.dirname(filePath);
|
|
18353
|
+
await (0, import_promises31.mkdir)(dir, { recursive: true });
|
|
18354
|
+
await (0, import_promises31.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
17853
18355
|
}
|
|
17854
18356
|
keyToPath(key) {
|
|
17855
18357
|
const prefix = key.slice(0, 2);
|
|
17856
|
-
return
|
|
18358
|
+
return import_node_path44.default.join(this.cachePath, prefix, `${key}.json`);
|
|
17857
18359
|
}
|
|
17858
18360
|
};
|
|
17859
18361
|
function shouldEnableCache(params) {
|
|
@@ -18340,6 +18842,7 @@ function createAgentKernel() {
|
|
|
18340
18842
|
TokenUsageEvaluator,
|
|
18341
18843
|
ToolTrajectoryEvaluator,
|
|
18342
18844
|
WorkspaceCreationError,
|
|
18845
|
+
WorkspacePoolManager,
|
|
18343
18846
|
assembleLlmJudgePrompt,
|
|
18344
18847
|
avgToolDurationMs,
|
|
18345
18848
|
buildDirectoryChain,
|
|
@@ -18354,6 +18857,7 @@ function createAgentKernel() {
|
|
|
18354
18857
|
cleanupEvalWorkspaces,
|
|
18355
18858
|
cleanupWorkspace,
|
|
18356
18859
|
computeTraceSummary,
|
|
18860
|
+
computeWorkspaceFingerprint,
|
|
18357
18861
|
consumeClaudeLogEntries,
|
|
18358
18862
|
consumeCodexLogEntries,
|
|
18359
18863
|
consumeCopilotCliLogEntries,
|
|
@@ -18391,6 +18895,7 @@ function createAgentKernel() {
|
|
|
18391
18895
|
getSubagentsRoot,
|
|
18392
18896
|
getTraceStateRoot,
|
|
18393
18897
|
getWorkspacePath,
|
|
18898
|
+
getWorkspacePoolRoot,
|
|
18394
18899
|
getWorkspacesRoot,
|
|
18395
18900
|
initializeBaseline,
|
|
18396
18901
|
isEvaluatorKind,
|