@agentv/core 2.14.3 → 2.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-N55K52OO.js → chunk-E6AJPAXM.js} +1 -1
- package/dist/chunk-E6AJPAXM.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +8 -7
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +9 -8
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1079 -610
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +139 -34
- package/dist/index.d.ts +139 -34
- package/dist/index.js +1074 -607
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-N55K52OO.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1244,12 +1244,12 @@ function serializeAttributeValue(value) {
|
|
|
1244
1244
|
if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
|
|
1245
1245
|
return { stringValue: String(value) };
|
|
1246
1246
|
}
|
|
1247
|
-
var import_promises31,
|
|
1247
|
+
var import_promises31, import_node_path45, OtlpJsonFileExporter;
|
|
1248
1248
|
var init_otlp_json_file_exporter = __esm({
|
|
1249
1249
|
"src/observability/otlp-json-file-exporter.ts"() {
|
|
1250
1250
|
"use strict";
|
|
1251
1251
|
import_promises31 = require("fs/promises");
|
|
1252
|
-
|
|
1252
|
+
import_node_path45 = require("path");
|
|
1253
1253
|
OtlpJsonFileExporter = class {
|
|
1254
1254
|
// biome-ignore lint/suspicious/noExplicitAny: serialized span data
|
|
1255
1255
|
spans = [];
|
|
@@ -1288,7 +1288,7 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1288
1288
|
}
|
|
1289
1289
|
async flush() {
|
|
1290
1290
|
if (this.spans.length === 0) return;
|
|
1291
|
-
await (0, import_promises31.mkdir)((0,
|
|
1291
|
+
await (0, import_promises31.mkdir)((0, import_node_path45.dirname)(this.filePath), { recursive: true });
|
|
1292
1292
|
const otlpJson = {
|
|
1293
1293
|
resourceSpans: [
|
|
1294
1294
|
{
|
|
@@ -1319,13 +1319,13 @@ function hrTimeDiffMs(start, end) {
|
|
|
1319
1319
|
const diffNano = end[1] - start[1];
|
|
1320
1320
|
return Math.round(diffSec * 1e3 + diffNano / 1e6);
|
|
1321
1321
|
}
|
|
1322
|
-
var import_node_fs13, import_promises32,
|
|
1322
|
+
var import_node_fs13, import_promises32, import_node_path46, SimpleTraceFileExporter;
|
|
1323
1323
|
var init_simple_trace_file_exporter = __esm({
|
|
1324
1324
|
"src/observability/simple-trace-file-exporter.ts"() {
|
|
1325
1325
|
"use strict";
|
|
1326
1326
|
import_node_fs13 = require("fs");
|
|
1327
1327
|
import_promises32 = require("fs/promises");
|
|
1328
|
-
|
|
1328
|
+
import_node_path46 = require("path");
|
|
1329
1329
|
SimpleTraceFileExporter = class {
|
|
1330
1330
|
stream = null;
|
|
1331
1331
|
filePath;
|
|
@@ -1338,7 +1338,7 @@ var init_simple_trace_file_exporter = __esm({
|
|
|
1338
1338
|
async ensureStream() {
|
|
1339
1339
|
if (!this.streamReady) {
|
|
1340
1340
|
this.streamReady = (async () => {
|
|
1341
|
-
await (0, import_promises32.mkdir)((0,
|
|
1341
|
+
await (0, import_promises32.mkdir)((0, import_node_path46.dirname)(this.filePath), { recursive: true });
|
|
1342
1342
|
this.stream = (0, import_node_fs13.createWriteStream)(this.filePath, { flags: "w" });
|
|
1343
1343
|
return this.stream;
|
|
1344
1344
|
})();
|
|
@@ -1457,6 +1457,7 @@ __export(index_exports, {
|
|
|
1457
1457
|
TokenUsageEvaluator: () => TokenUsageEvaluator,
|
|
1458
1458
|
ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
|
|
1459
1459
|
WorkspaceCreationError: () => WorkspaceCreationError,
|
|
1460
|
+
WorkspacePoolManager: () => WorkspacePoolManager,
|
|
1460
1461
|
assembleLlmJudgePrompt: () => assembleLlmJudgePrompt,
|
|
1461
1462
|
avgToolDurationMs: () => avgToolDurationMs,
|
|
1462
1463
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
@@ -1471,6 +1472,7 @@ __export(index_exports, {
|
|
|
1471
1472
|
cleanupEvalWorkspaces: () => cleanupEvalWorkspaces,
|
|
1472
1473
|
cleanupWorkspace: () => cleanupWorkspace,
|
|
1473
1474
|
computeTraceSummary: () => computeTraceSummary,
|
|
1475
|
+
computeWorkspaceFingerprint: () => computeWorkspaceFingerprint,
|
|
1474
1476
|
consumeClaudeLogEntries: () => consumeClaudeLogEntries,
|
|
1475
1477
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
1476
1478
|
consumeCopilotCliLogEntries: () => consumeCopilotCliLogEntries,
|
|
@@ -1503,11 +1505,11 @@ __export(index_exports, {
|
|
|
1503
1505
|
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
1504
1506
|
generateRubrics: () => generateRubrics,
|
|
1505
1507
|
getAgentvHome: () => getAgentvHome,
|
|
1506
|
-
getGitCacheRoot: () => getGitCacheRoot,
|
|
1507
1508
|
getHitCount: () => getHitCount,
|
|
1508
1509
|
getSubagentsRoot: () => getSubagentsRoot,
|
|
1509
1510
|
getTraceStateRoot: () => getTraceStateRoot,
|
|
1510
1511
|
getWorkspacePath: () => getWorkspacePath,
|
|
1512
|
+
getWorkspacePoolRoot: () => getWorkspacePoolRoot,
|
|
1511
1513
|
getWorkspacesRoot: () => getWorkspacesRoot,
|
|
1512
1514
|
initializeBaseline: () => initializeBaseline,
|
|
1513
1515
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
@@ -2236,6 +2238,17 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
2236
2238
|
} else if (otelFile !== void 0) {
|
|
2237
2239
|
logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
|
|
2238
2240
|
}
|
|
2241
|
+
if (typeof obj.pool_workspaces === "boolean") {
|
|
2242
|
+
result.pool_workspaces = obj.pool_workspaces;
|
|
2243
|
+
} else if (obj.pool_workspaces !== void 0) {
|
|
2244
|
+
logWarning(`Invalid execution.pool_workspaces in ${configPath}, expected boolean`);
|
|
2245
|
+
}
|
|
2246
|
+
const poolSlots = obj.pool_slots;
|
|
2247
|
+
if (typeof poolSlots === "number" && Number.isInteger(poolSlots) && poolSlots >= 1 && poolSlots <= 50) {
|
|
2248
|
+
result.pool_slots = poolSlots;
|
|
2249
|
+
} else if (poolSlots !== void 0) {
|
|
2250
|
+
logWarning(`Invalid execution.pool_slots in ${configPath}, expected integer 1-50`);
|
|
2251
|
+
}
|
|
2239
2252
|
return Object.keys(result).length > 0 ? result : void 0;
|
|
2240
2253
|
}
|
|
2241
2254
|
function logWarning(message) {
|
|
@@ -3677,6 +3690,7 @@ async function processMessages(options) {
|
|
|
3677
3690
|
repoRootPath,
|
|
3678
3691
|
guidelinePatterns,
|
|
3679
3692
|
guidelinePaths,
|
|
3693
|
+
treatFileSegmentsAsGuidelines,
|
|
3680
3694
|
textParts,
|
|
3681
3695
|
messageType,
|
|
3682
3696
|
verbose
|
|
@@ -3724,16 +3738,20 @@ async function processMessages(options) {
|
|
|
3724
3738
|
}
|
|
3725
3739
|
try {
|
|
3726
3740
|
const fileContent = (await (0, import_promises5.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
3727
|
-
|
|
3728
|
-
|
|
3729
|
-
|
|
3730
|
-
|
|
3731
|
-
|
|
3732
|
-
|
|
3733
|
-
|
|
3734
|
-
|
|
3735
|
-
|
|
3741
|
+
const classifyAsGuideline = shouldTreatAsGuideline({
|
|
3742
|
+
messageType,
|
|
3743
|
+
resolvedPath,
|
|
3744
|
+
repoRootPath,
|
|
3745
|
+
guidelinePatterns,
|
|
3746
|
+
treatFileSegmentsAsGuidelines
|
|
3747
|
+
});
|
|
3748
|
+
if (classifyAsGuideline && guidelinePaths) {
|
|
3749
|
+
guidelinePaths.push(import_node_path5.default.resolve(resolvedPath));
|
|
3750
|
+
if (verbose) {
|
|
3751
|
+
console.log(` [Guideline] Found: ${displayPath}`);
|
|
3752
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
3736
3753
|
}
|
|
3754
|
+
continue;
|
|
3737
3755
|
}
|
|
3738
3756
|
segments.push({
|
|
3739
3757
|
type: "file",
|
|
@@ -3762,6 +3780,26 @@ async function processMessages(options) {
|
|
|
3762
3780
|
}
|
|
3763
3781
|
return segments;
|
|
3764
3782
|
}
|
|
3783
|
+
function shouldTreatAsGuideline(options) {
|
|
3784
|
+
const {
|
|
3785
|
+
messageType,
|
|
3786
|
+
resolvedPath,
|
|
3787
|
+
repoRootPath,
|
|
3788
|
+
guidelinePatterns,
|
|
3789
|
+
treatFileSegmentsAsGuidelines
|
|
3790
|
+
} = options;
|
|
3791
|
+
if (messageType !== "input") {
|
|
3792
|
+
return false;
|
|
3793
|
+
}
|
|
3794
|
+
if (treatFileSegmentsAsGuidelines) {
|
|
3795
|
+
return true;
|
|
3796
|
+
}
|
|
3797
|
+
if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
3798
|
+
return false;
|
|
3799
|
+
}
|
|
3800
|
+
const relativeToRepo = import_node_path5.default.relative(repoRootPath, resolvedPath);
|
|
3801
|
+
return isGuidelineFile(relativeToRepo, guidelinePatterns);
|
|
3802
|
+
}
|
|
3765
3803
|
function asString3(value) {
|
|
3766
3804
|
return typeof value === "string" ? value : void 0;
|
|
3767
3805
|
}
|
|
@@ -4100,6 +4138,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4100
4138
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
4101
4139
|
console.log(` - ${guidelinePath}`);
|
|
4102
4140
|
}
|
|
4141
|
+
} else if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
4142
|
+
console.log(" No guidelines found (guideline_patterns not configured)");
|
|
4103
4143
|
} else {
|
|
4104
4144
|
console.log(" No guidelines found");
|
|
4105
4145
|
}
|
|
@@ -4469,7 +4509,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4469
4509
|
} else {
|
|
4470
4510
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
4471
4511
|
}
|
|
4472
|
-
const suiteWorkspace =
|
|
4512
|
+
const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
|
|
4473
4513
|
const suiteInputMessages = expandInputShorthand(suite.input);
|
|
4474
4514
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
4475
4515
|
const _globalTarget = asString6(rawGlobalExecution?.target) ?? asString6(suite.target);
|
|
@@ -4505,12 +4545,24 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4505
4545
|
}
|
|
4506
4546
|
const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
|
|
4507
4547
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
4508
|
-
const
|
|
4548
|
+
const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
|
|
4549
|
+
const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
|
|
4509
4550
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
4510
4551
|
const guidelinePaths = [];
|
|
4511
4552
|
const inputTextParts = [];
|
|
4512
|
-
const
|
|
4513
|
-
messages:
|
|
4553
|
+
const suiteInputSegments = effectiveSuiteInputMessages ? await processMessages({
|
|
4554
|
+
messages: effectiveSuiteInputMessages,
|
|
4555
|
+
searchRoots,
|
|
4556
|
+
repoRootPath,
|
|
4557
|
+
guidelinePatterns,
|
|
4558
|
+
guidelinePaths,
|
|
4559
|
+
treatFileSegmentsAsGuidelines: true,
|
|
4560
|
+
textParts: inputTextParts,
|
|
4561
|
+
messageType: "input",
|
|
4562
|
+
verbose
|
|
4563
|
+
}) : [];
|
|
4564
|
+
const testInputSegments = await processMessages({
|
|
4565
|
+
messages: testInputMessages,
|
|
4514
4566
|
searchRoots,
|
|
4515
4567
|
repoRootPath,
|
|
4516
4568
|
guidelinePatterns,
|
|
@@ -4519,6 +4571,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4519
4571
|
messageType: "input",
|
|
4520
4572
|
verbose
|
|
4521
4573
|
});
|
|
4574
|
+
const inputSegments = [...suiteInputSegments, ...testInputSegments];
|
|
4522
4575
|
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
4523
4576
|
messages: expectedMessages,
|
|
4524
4577
|
searchRoots,
|
|
@@ -4566,7 +4619,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4566
4619
|
...guidelinePaths.map((guidelinePath) => import_node_path8.default.resolve(guidelinePath)),
|
|
4567
4620
|
...userFilePaths
|
|
4568
4621
|
];
|
|
4569
|
-
const caseWorkspace =
|
|
4622
|
+
const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
|
|
4570
4623
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
4571
4624
|
const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
|
|
4572
4625
|
const caseTargets = extractTargetsFromTestCase(evalcase);
|
|
@@ -4597,6 +4650,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4597
4650
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
4598
4651
|
console.log(` - ${guidelinePath}`);
|
|
4599
4652
|
}
|
|
4653
|
+
} else if (!guidelinePatterns || guidelinePatterns.length === 0) {
|
|
4654
|
+
console.log(" No guidelines found (guideline_patterns not configured)");
|
|
4600
4655
|
} else {
|
|
4601
4656
|
console.log(" No guidelines found");
|
|
4602
4657
|
}
|
|
@@ -4685,16 +4740,57 @@ function parseRepoConfig(raw) {
|
|
|
4685
4740
|
...clone !== void 0 && { clone }
|
|
4686
4741
|
};
|
|
4687
4742
|
}
|
|
4688
|
-
function
|
|
4743
|
+
function parseWorkspaceHookConfig(raw, evalFileDir) {
|
|
4689
4744
|
if (!isJsonObject(raw)) return void 0;
|
|
4745
|
+
const script = parseWorkspaceScriptConfig(raw, evalFileDir);
|
|
4690
4746
|
const obj = raw;
|
|
4691
|
-
const
|
|
4692
|
-
const
|
|
4693
|
-
if (!
|
|
4747
|
+
const reset = obj.reset === "none" || obj.reset === "fast" || obj.reset === "strict" ? obj.reset : void 0;
|
|
4748
|
+
const clean = obj.clean === "always" || obj.clean === "on_success" || obj.clean === "on_failure" || obj.clean === "never" ? obj.clean : void 0;
|
|
4749
|
+
if (!script && !reset && !clean) return void 0;
|
|
4694
4750
|
return {
|
|
4695
|
-
...
|
|
4696
|
-
...
|
|
4751
|
+
...script ?? {},
|
|
4752
|
+
...reset !== void 0 && { reset },
|
|
4753
|
+
...clean !== void 0 && { clean }
|
|
4754
|
+
};
|
|
4755
|
+
}
|
|
4756
|
+
function parseWorkspaceHooksConfig(raw, evalFileDir) {
|
|
4757
|
+
if (!isJsonObject(raw)) return void 0;
|
|
4758
|
+
const obj = raw;
|
|
4759
|
+
const beforeAllTests = parseWorkspaceHookConfig(obj.before_all_tests, evalFileDir);
|
|
4760
|
+
const beforeEachTest = parseWorkspaceHookConfig(obj.before_each_test, evalFileDir);
|
|
4761
|
+
const afterEachTest = parseWorkspaceHookConfig(obj.after_each_test, evalFileDir);
|
|
4762
|
+
const afterAllTests = parseWorkspaceHookConfig(obj.after_all_tests, evalFileDir);
|
|
4763
|
+
const onReuse = parseWorkspaceHookConfig(obj.on_reuse, evalFileDir);
|
|
4764
|
+
const onFinish = parseWorkspaceHookConfig(obj.on_finish, evalFileDir);
|
|
4765
|
+
const hooks = {
|
|
4766
|
+
...beforeAllTests !== void 0 && { before_all_tests: beforeAllTests },
|
|
4767
|
+
...beforeEachTest !== void 0 && { before_each_test: beforeEachTest },
|
|
4768
|
+
...afterEachTest !== void 0 && { after_each_test: afterEachTest },
|
|
4769
|
+
...afterAllTests !== void 0 && { after_all_tests: afterAllTests },
|
|
4770
|
+
...onReuse !== void 0 && { on_reuse: onReuse },
|
|
4771
|
+
...onFinish !== void 0 && { on_finish: onFinish }
|
|
4697
4772
|
};
|
|
4773
|
+
return Object.keys(hooks).length > 0 ? hooks : void 0;
|
|
4774
|
+
}
|
|
4775
|
+
async function resolveWorkspaceConfig(raw, evalFileDir) {
|
|
4776
|
+
if (typeof raw === "string") {
|
|
4777
|
+
const workspaceFilePath = import_node_path8.default.resolve(evalFileDir, raw);
|
|
4778
|
+
let content;
|
|
4779
|
+
try {
|
|
4780
|
+
content = await (0, import_promises8.readFile)(workspaceFilePath, "utf8");
|
|
4781
|
+
} catch {
|
|
4782
|
+
throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
|
|
4783
|
+
}
|
|
4784
|
+
const parsed = (0, import_yaml4.parse)(content);
|
|
4785
|
+
if (!isJsonObject(parsed)) {
|
|
4786
|
+
throw new Error(
|
|
4787
|
+
`Invalid workspace file format: ${workspaceFilePath} (expected a YAML object)`
|
|
4788
|
+
);
|
|
4789
|
+
}
|
|
4790
|
+
const workspaceFileDir = import_node_path8.default.dirname(workspaceFilePath);
|
|
4791
|
+
return parseWorkspaceConfig(parsed, workspaceFileDir);
|
|
4792
|
+
}
|
|
4793
|
+
return parseWorkspaceConfig(raw, evalFileDir);
|
|
4698
4794
|
}
|
|
4699
4795
|
function parseWorkspaceConfig(raw, evalFileDir) {
|
|
4700
4796
|
if (!isJsonObject(raw)) return void 0;
|
|
@@ -4705,37 +4801,56 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
4705
4801
|
}
|
|
4706
4802
|
const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
|
|
4707
4803
|
const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
|
|
4708
|
-
const
|
|
4709
|
-
const
|
|
4710
|
-
const
|
|
4711
|
-
const
|
|
4712
|
-
|
|
4713
|
-
if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
|
|
4804
|
+
const hooks = parseWorkspaceHooksConfig(obj.hooks, evalFileDir);
|
|
4805
|
+
const mode = obj.mode === "pooled" || obj.mode === "ephemeral" || obj.mode === "static" ? obj.mode : void 0;
|
|
4806
|
+
const staticPath = typeof obj.static_path === "string" ? obj.static_path : void 0;
|
|
4807
|
+
const pool = typeof obj.pool === "boolean" ? obj.pool : void 0;
|
|
4808
|
+
if (!template && !isolation && !repos && !hooks && !mode && !staticPath && pool === void 0)
|
|
4714
4809
|
return void 0;
|
|
4715
4810
|
return {
|
|
4716
4811
|
...template !== void 0 && { template },
|
|
4717
4812
|
...isolation !== void 0 && { isolation },
|
|
4718
4813
|
...repos !== void 0 && { repos },
|
|
4719
|
-
...
|
|
4720
|
-
...
|
|
4721
|
-
...
|
|
4722
|
-
...
|
|
4723
|
-
...afterEach !== void 0 && { after_each: afterEach }
|
|
4814
|
+
...hooks !== void 0 && { hooks },
|
|
4815
|
+
...mode !== void 0 && { mode },
|
|
4816
|
+
...staticPath !== void 0 && { static_path: staticPath },
|
|
4817
|
+
...pool !== void 0 && { pool }
|
|
4724
4818
|
};
|
|
4725
4819
|
}
|
|
4726
4820
|
function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
4727
4821
|
if (!suiteLevel && !caseLevel) return void 0;
|
|
4728
4822
|
if (!suiteLevel) return caseLevel;
|
|
4729
4823
|
if (!caseLevel) return suiteLevel;
|
|
4824
|
+
const mergeHook = (suiteHook, caseHook) => {
|
|
4825
|
+
if (!suiteHook && !caseHook) return void 0;
|
|
4826
|
+
return {
|
|
4827
|
+
...suiteHook ?? {},
|
|
4828
|
+
...caseHook ?? {}
|
|
4829
|
+
};
|
|
4830
|
+
};
|
|
4831
|
+
const mergedHooks = {
|
|
4832
|
+
before_all_tests: mergeHook(
|
|
4833
|
+
suiteLevel.hooks?.before_all_tests,
|
|
4834
|
+
caseLevel.hooks?.before_all_tests
|
|
4835
|
+
),
|
|
4836
|
+
before_each_test: mergeHook(
|
|
4837
|
+
suiteLevel.hooks?.before_each_test,
|
|
4838
|
+
caseLevel.hooks?.before_each_test
|
|
4839
|
+
),
|
|
4840
|
+
after_each_test: mergeHook(suiteLevel.hooks?.after_each_test, caseLevel.hooks?.after_each_test),
|
|
4841
|
+
after_all_tests: mergeHook(suiteLevel.hooks?.after_all_tests, caseLevel.hooks?.after_all_tests),
|
|
4842
|
+
on_reuse: mergeHook(suiteLevel.hooks?.on_reuse, caseLevel.hooks?.on_reuse),
|
|
4843
|
+
on_finish: mergeHook(suiteLevel.hooks?.on_finish, caseLevel.hooks?.on_finish)
|
|
4844
|
+
};
|
|
4845
|
+
const hasHooks = Object.values(mergedHooks).some((hook) => hook !== void 0);
|
|
4730
4846
|
return {
|
|
4731
4847
|
template: caseLevel.template ?? suiteLevel.template,
|
|
4732
4848
|
isolation: caseLevel.isolation ?? suiteLevel.isolation,
|
|
4733
4849
|
repos: caseLevel.repos ?? suiteLevel.repos,
|
|
4734
|
-
|
|
4735
|
-
|
|
4736
|
-
|
|
4737
|
-
|
|
4738
|
-
after_each: caseLevel.after_each ?? suiteLevel.after_each
|
|
4850
|
+
...hasHooks && { hooks: mergedHooks },
|
|
4851
|
+
mode: caseLevel.mode ?? suiteLevel.mode,
|
|
4852
|
+
static_path: caseLevel.static_path ?? suiteLevel.static_path,
|
|
4853
|
+
pool: caseLevel.pool ?? suiteLevel.pool
|
|
4739
4854
|
};
|
|
4740
4855
|
}
|
|
4741
4856
|
function asString6(value) {
|
|
@@ -9493,8 +9608,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
9493
9608
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
9494
9609
|
if (!parseResult.success) {
|
|
9495
9610
|
const firstError = parseResult.error.errors[0];
|
|
9496
|
-
const
|
|
9497
|
-
const prefix =
|
|
9611
|
+
const path44 = firstError?.path.join(".") || "";
|
|
9612
|
+
const prefix = path44 ? `${target.name} ${path44}: ` : `${target.name}: `;
|
|
9498
9613
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
9499
9614
|
}
|
|
9500
9615
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -10001,15 +10116,15 @@ function getAgentvHome() {
|
|
|
10001
10116
|
function getWorkspacesRoot() {
|
|
10002
10117
|
return import_node_path23.default.join(getAgentvHome(), "workspaces");
|
|
10003
10118
|
}
|
|
10004
|
-
function getGitCacheRoot() {
|
|
10005
|
-
return import_node_path23.default.join(getAgentvHome(), "git-cache");
|
|
10006
|
-
}
|
|
10007
10119
|
function getSubagentsRoot() {
|
|
10008
10120
|
return import_node_path23.default.join(getAgentvHome(), "subagents");
|
|
10009
10121
|
}
|
|
10010
10122
|
function getTraceStateRoot() {
|
|
10011
10123
|
return import_node_path23.default.join(getAgentvHome(), "trace-state");
|
|
10012
10124
|
}
|
|
10125
|
+
function getWorkspacePoolRoot() {
|
|
10126
|
+
return import_node_path23.default.join(getAgentvHome(), "workspace-pool");
|
|
10127
|
+
}
|
|
10013
10128
|
|
|
10014
10129
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
10015
10130
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
@@ -10832,8 +10947,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
10832
10947
|
|
|
10833
10948
|
**IMPORTANT**: Follow these exact steps:
|
|
10834
10949
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
10835
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
10836
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
10837
10950
|
2. When completely finished, run these PowerShell commands to signal completion:
|
|
10838
10951
|
\`\`\`
|
|
10839
10952
|
Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
|
|
@@ -10850,8 +10963,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
|
10850
10963
|
|
|
10851
10964
|
**IMPORTANT**: Follow these exact steps:
|
|
10852
10965
|
1. Create and write your complete response to: {{responseFileTmp}}
|
|
10853
|
-
- All intended file outputs/changes MUST be written in your response file.
|
|
10854
|
-
- For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
|
|
10855
10966
|
2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
|
|
10856
10967
|
3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
|
|
10857
10968
|
`;
|
|
@@ -11464,15 +11575,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
11464
11575
|
});
|
|
11465
11576
|
}
|
|
11466
11577
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
11467
|
-
const { mkdir: mkdir16, readFile:
|
|
11578
|
+
const { mkdir: mkdir16, readFile: readFile14, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
11468
11579
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
11469
|
-
const
|
|
11580
|
+
const path44 = await import("path");
|
|
11470
11581
|
const { randomUUID: randomUUID8 } = await import("crypto");
|
|
11471
|
-
const dir =
|
|
11582
|
+
const dir = path44.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
|
|
11472
11583
|
await mkdir16(dir, { recursive: true });
|
|
11473
|
-
const stdinPath =
|
|
11474
|
-
const stdoutPath =
|
|
11475
|
-
const stderrPath =
|
|
11584
|
+
const stdinPath = path44.join(dir, "stdin.txt");
|
|
11585
|
+
const stdoutPath = path44.join(dir, "stdout.txt");
|
|
11586
|
+
const stderrPath = path44.join(dir, "stderr.txt");
|
|
11476
11587
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
11477
11588
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
11478
11589
|
const { spawn: spawn4 } = await import("child_process");
|
|
@@ -11502,8 +11613,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
11502
11613
|
resolve(code ?? 0);
|
|
11503
11614
|
});
|
|
11504
11615
|
});
|
|
11505
|
-
const stdout = (await
|
|
11506
|
-
const stderr = (await
|
|
11616
|
+
const stdout = (await readFile14(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
11617
|
+
const stderr = (await readFile14(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
11507
11618
|
return { stdout, stderr, exitCode };
|
|
11508
11619
|
} finally {
|
|
11509
11620
|
await rm6(dir, { recursive: true, force: true });
|
|
@@ -11824,7 +11935,7 @@ var CodeEvaluator = class {
|
|
|
11824
11935
|
outputPath,
|
|
11825
11936
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
11826
11937
|
inputFiles: context2.evalCase.file_paths.filter(
|
|
11827
|
-
(
|
|
11938
|
+
(path44) => !context2.evalCase.guideline_paths.includes(path44)
|
|
11828
11939
|
),
|
|
11829
11940
|
input: context2.evalCase.input,
|
|
11830
11941
|
trace: context2.trace ?? null,
|
|
@@ -12103,6 +12214,8 @@ ${context2.fileChanges}`;
|
|
|
12103
12214
|
};
|
|
12104
12215
|
} catch (e) {
|
|
12105
12216
|
const message = e instanceof Error ? e.message : String(e);
|
|
12217
|
+
const evalName = context2.evaluator?.name ?? "llm-judge";
|
|
12218
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
12106
12219
|
return {
|
|
12107
12220
|
score: 0,
|
|
12108
12221
|
verdict: "skip",
|
|
@@ -12131,24 +12244,39 @@ ${context2.fileChanges}`;
|
|
|
12131
12244
|
systemPrompt,
|
|
12132
12245
|
target: judgeProvider.targetName
|
|
12133
12246
|
};
|
|
12134
|
-
|
|
12135
|
-
|
|
12136
|
-
|
|
12137
|
-
|
|
12138
|
-
|
|
12139
|
-
|
|
12140
|
-
|
|
12141
|
-
|
|
12142
|
-
|
|
12143
|
-
|
|
12144
|
-
|
|
12145
|
-
|
|
12146
|
-
|
|
12147
|
-
|
|
12148
|
-
|
|
12149
|
-
|
|
12150
|
-
|
|
12151
|
-
|
|
12247
|
+
try {
|
|
12248
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
12249
|
+
context: context2,
|
|
12250
|
+
judgeProvider,
|
|
12251
|
+
systemPrompt,
|
|
12252
|
+
userPrompt: prompt,
|
|
12253
|
+
schema: rubricEvaluationSchema
|
|
12254
|
+
});
|
|
12255
|
+
const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
|
|
12256
|
+
return {
|
|
12257
|
+
score,
|
|
12258
|
+
verdict,
|
|
12259
|
+
hits,
|
|
12260
|
+
misses,
|
|
12261
|
+
expectedAspectCount: rubrics.length,
|
|
12262
|
+
reasoning: data.overall_reasoning,
|
|
12263
|
+
evaluatorRawRequest,
|
|
12264
|
+
tokenUsage
|
|
12265
|
+
};
|
|
12266
|
+
} catch (e) {
|
|
12267
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
12268
|
+
const evalName = context2.evaluator?.name ?? "llm-judge";
|
|
12269
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
12270
|
+
return {
|
|
12271
|
+
score: 0,
|
|
12272
|
+
verdict: "skip",
|
|
12273
|
+
hits: [],
|
|
12274
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
12275
|
+
expectedAspectCount: rubrics.length,
|
|
12276
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
12277
|
+
evaluatorRawRequest
|
|
12278
|
+
};
|
|
12279
|
+
}
|
|
12152
12280
|
}
|
|
12153
12281
|
/**
|
|
12154
12282
|
* Evaluate using score-range rubrics (analytic rubric scoring).
|
|
@@ -12162,25 +12290,40 @@ ${context2.fileChanges}`;
|
|
|
12162
12290
|
systemPrompt,
|
|
12163
12291
|
target: judgeProvider.targetName
|
|
12164
12292
|
};
|
|
12165
|
-
|
|
12166
|
-
|
|
12167
|
-
|
|
12168
|
-
|
|
12169
|
-
|
|
12170
|
-
|
|
12171
|
-
|
|
12172
|
-
|
|
12173
|
-
|
|
12174
|
-
|
|
12175
|
-
|
|
12176
|
-
|
|
12177
|
-
|
|
12178
|
-
|
|
12179
|
-
|
|
12180
|
-
|
|
12181
|
-
|
|
12182
|
-
|
|
12183
|
-
|
|
12293
|
+
try {
|
|
12294
|
+
const { data, tokenUsage } = await this.runWithRetry({
|
|
12295
|
+
context: context2,
|
|
12296
|
+
judgeProvider,
|
|
12297
|
+
systemPrompt,
|
|
12298
|
+
userPrompt: prompt,
|
|
12299
|
+
schema: scoreRangeEvaluationSchema
|
|
12300
|
+
});
|
|
12301
|
+
const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics);
|
|
12302
|
+
return {
|
|
12303
|
+
score,
|
|
12304
|
+
verdict,
|
|
12305
|
+
hits,
|
|
12306
|
+
misses,
|
|
12307
|
+
expectedAspectCount: rubrics.length,
|
|
12308
|
+
reasoning: data.overall_reasoning,
|
|
12309
|
+
evaluatorRawRequest,
|
|
12310
|
+
details,
|
|
12311
|
+
tokenUsage
|
|
12312
|
+
};
|
|
12313
|
+
} catch (e) {
|
|
12314
|
+
const message = e instanceof Error ? e.message : String(e);
|
|
12315
|
+
const evalName = context2.evaluator?.name ?? "llm-judge";
|
|
12316
|
+
console.warn(`\u26A0 LLM judge "${evalName}" failed after 3 attempts (${message}) \u2014 skipped`);
|
|
12317
|
+
return {
|
|
12318
|
+
score: 0,
|
|
12319
|
+
verdict: "skip",
|
|
12320
|
+
hits: [],
|
|
12321
|
+
misses: [`Judge parse failure after 3 attempts: ${message}`],
|
|
12322
|
+
expectedAspectCount: rubrics.length,
|
|
12323
|
+
reasoning: `Judge parse failure after 3 attempts: ${message}`,
|
|
12324
|
+
evaluatorRawRequest
|
|
12325
|
+
};
|
|
12326
|
+
}
|
|
12184
12327
|
}
|
|
12185
12328
|
/**
|
|
12186
12329
|
* Build prompt for score-range rubric evaluation.
|
|
@@ -12466,19 +12609,13 @@ var CompositeEvaluator = class {
|
|
|
12466
12609
|
runWeightedAverage(results, weights) {
|
|
12467
12610
|
let totalWeight = 0;
|
|
12468
12611
|
let weightedSum = 0;
|
|
12612
|
+
let evaluatedCount = 0;
|
|
12469
12613
|
const allHits = [];
|
|
12470
12614
|
const allMisses = [];
|
|
12471
12615
|
const reasoningParts = [];
|
|
12472
12616
|
const scores = [];
|
|
12473
12617
|
for (const member of results) {
|
|
12474
12618
|
const weight = weights?.[member.id] ?? 1;
|
|
12475
|
-
totalWeight += weight;
|
|
12476
|
-
weightedSum += member.result.score * weight;
|
|
12477
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
12478
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
12479
|
-
if (member.result.reasoning) {
|
|
12480
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
12481
|
-
}
|
|
12482
12619
|
scores.push({
|
|
12483
12620
|
name: member.id,
|
|
12484
12621
|
type: member.type,
|
|
@@ -12493,6 +12630,32 @@ var CompositeEvaluator = class {
|
|
|
12493
12630
|
details: member.result.details,
|
|
12494
12631
|
tokenUsage: member.result.tokenUsage
|
|
12495
12632
|
});
|
|
12633
|
+
if (member.result.verdict === "skip") {
|
|
12634
|
+
continue;
|
|
12635
|
+
}
|
|
12636
|
+
evaluatedCount++;
|
|
12637
|
+
totalWeight += weight;
|
|
12638
|
+
weightedSum += member.result.score * weight;
|
|
12639
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
12640
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
12641
|
+
if (member.result.reasoning) {
|
|
12642
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
12643
|
+
}
|
|
12644
|
+
}
|
|
12645
|
+
if (evaluatedCount === 0 && results.length > 0) {
|
|
12646
|
+
return {
|
|
12647
|
+
score: 0,
|
|
12648
|
+
verdict: "skip",
|
|
12649
|
+
hits: [],
|
|
12650
|
+
misses: [],
|
|
12651
|
+
expectedAspectCount: 1,
|
|
12652
|
+
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
12653
|
+
evaluatorRawRequest: {
|
|
12654
|
+
aggregator: "weighted_average",
|
|
12655
|
+
...weights ? { weights } : {}
|
|
12656
|
+
},
|
|
12657
|
+
scores
|
|
12658
|
+
};
|
|
12496
12659
|
}
|
|
12497
12660
|
const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
12498
12661
|
return {
|
|
@@ -12516,19 +12679,8 @@ var CompositeEvaluator = class {
|
|
|
12516
12679
|
const reasoningParts = [];
|
|
12517
12680
|
let passingCount = 0;
|
|
12518
12681
|
let borderlineCount = 0;
|
|
12682
|
+
let evaluatedCount = 0;
|
|
12519
12683
|
for (const member of results) {
|
|
12520
|
-
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
12521
|
-
if (isPassing) {
|
|
12522
|
-
passingCount++;
|
|
12523
|
-
if (member.result.verdict === "borderline") {
|
|
12524
|
-
borderlineCount++;
|
|
12525
|
-
}
|
|
12526
|
-
}
|
|
12527
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
12528
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
12529
|
-
if (member.result.reasoning) {
|
|
12530
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
12531
|
-
}
|
|
12532
12684
|
scores.push({
|
|
12533
12685
|
name: member.id,
|
|
12534
12686
|
type: member.type,
|
|
@@ -12542,8 +12694,39 @@ var CompositeEvaluator = class {
|
|
|
12542
12694
|
details: member.result.details,
|
|
12543
12695
|
tokenUsage: member.result.tokenUsage
|
|
12544
12696
|
});
|
|
12697
|
+
if (member.result.verdict === "skip") {
|
|
12698
|
+
continue;
|
|
12699
|
+
}
|
|
12700
|
+
evaluatedCount++;
|
|
12701
|
+
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
12702
|
+
if (isPassing) {
|
|
12703
|
+
passingCount++;
|
|
12704
|
+
if (member.result.verdict === "borderline") {
|
|
12705
|
+
borderlineCount++;
|
|
12706
|
+
}
|
|
12707
|
+
}
|
|
12708
|
+
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
12709
|
+
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
12710
|
+
if (member.result.reasoning) {
|
|
12711
|
+
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
12712
|
+
}
|
|
12713
|
+
}
|
|
12714
|
+
if (evaluatedCount === 0 && results.length > 0) {
|
|
12715
|
+
return {
|
|
12716
|
+
score: 0,
|
|
12717
|
+
verdict: "skip",
|
|
12718
|
+
hits: [],
|
|
12719
|
+
misses: [],
|
|
12720
|
+
expectedAspectCount: 1,
|
|
12721
|
+
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
12722
|
+
evaluatorRawRequest: {
|
|
12723
|
+
aggregator: "threshold",
|
|
12724
|
+
threshold
|
|
12725
|
+
},
|
|
12726
|
+
scores
|
|
12727
|
+
};
|
|
12545
12728
|
}
|
|
12546
|
-
const totalCount =
|
|
12729
|
+
const totalCount = evaluatedCount;
|
|
12547
12730
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
12548
12731
|
const pass = score >= threshold;
|
|
12549
12732
|
if (pass && borderlineCount > 0) {
|
|
@@ -13051,115 +13234,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
13051
13234
|
* Evaluate a single field against the expected value.
|
|
13052
13235
|
*/
|
|
13053
13236
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
13054
|
-
const { path:
|
|
13055
|
-
const candidateValue = resolvePath(candidateData,
|
|
13056
|
-
const expectedValue = resolvePath(expectedData,
|
|
13237
|
+
const { path: path44, match, required = true, weight = 1 } = fieldConfig;
|
|
13238
|
+
const candidateValue = resolvePath(candidateData, path44);
|
|
13239
|
+
const expectedValue = resolvePath(expectedData, path44);
|
|
13057
13240
|
if (expectedValue === void 0) {
|
|
13058
13241
|
return {
|
|
13059
|
-
path:
|
|
13242
|
+
path: path44,
|
|
13060
13243
|
score: 1,
|
|
13061
13244
|
// No expected value means no comparison needed
|
|
13062
13245
|
weight,
|
|
13063
13246
|
hit: true,
|
|
13064
|
-
message: `${
|
|
13247
|
+
message: `${path44}: no expected value`
|
|
13065
13248
|
};
|
|
13066
13249
|
}
|
|
13067
13250
|
if (candidateValue === void 0) {
|
|
13068
13251
|
if (required) {
|
|
13069
13252
|
return {
|
|
13070
|
-
path:
|
|
13253
|
+
path: path44,
|
|
13071
13254
|
score: 0,
|
|
13072
13255
|
weight,
|
|
13073
13256
|
hit: false,
|
|
13074
|
-
message: `${
|
|
13257
|
+
message: `${path44} (required, missing)`
|
|
13075
13258
|
};
|
|
13076
13259
|
}
|
|
13077
13260
|
return {
|
|
13078
|
-
path:
|
|
13261
|
+
path: path44,
|
|
13079
13262
|
score: 1,
|
|
13080
13263
|
// Don't penalize missing optional fields
|
|
13081
13264
|
weight: 0,
|
|
13082
13265
|
// Zero weight means it won't affect the score
|
|
13083
13266
|
hit: true,
|
|
13084
|
-
message: `${
|
|
13267
|
+
message: `${path44}: optional field missing`
|
|
13085
13268
|
};
|
|
13086
13269
|
}
|
|
13087
13270
|
switch (match) {
|
|
13088
13271
|
case "exact":
|
|
13089
|
-
return this.compareExact(
|
|
13272
|
+
return this.compareExact(path44, candidateValue, expectedValue, weight);
|
|
13090
13273
|
case "numeric_tolerance":
|
|
13091
13274
|
return this.compareNumericTolerance(
|
|
13092
|
-
|
|
13275
|
+
path44,
|
|
13093
13276
|
candidateValue,
|
|
13094
13277
|
expectedValue,
|
|
13095
13278
|
fieldConfig,
|
|
13096
13279
|
weight
|
|
13097
13280
|
);
|
|
13098
13281
|
case "date":
|
|
13099
|
-
return this.compareDate(
|
|
13282
|
+
return this.compareDate(path44, candidateValue, expectedValue, fieldConfig, weight);
|
|
13100
13283
|
default:
|
|
13101
13284
|
return {
|
|
13102
|
-
path:
|
|
13285
|
+
path: path44,
|
|
13103
13286
|
score: 0,
|
|
13104
13287
|
weight,
|
|
13105
13288
|
hit: false,
|
|
13106
|
-
message: `${
|
|
13289
|
+
message: `${path44}: unknown match type "${match}"`
|
|
13107
13290
|
};
|
|
13108
13291
|
}
|
|
13109
13292
|
}
|
|
13110
13293
|
/**
|
|
13111
13294
|
* Exact equality comparison.
|
|
13112
13295
|
*/
|
|
13113
|
-
compareExact(
|
|
13296
|
+
compareExact(path44, candidateValue, expectedValue, weight) {
|
|
13114
13297
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
13115
13298
|
return {
|
|
13116
|
-
path:
|
|
13299
|
+
path: path44,
|
|
13117
13300
|
score: 1,
|
|
13118
13301
|
weight,
|
|
13119
13302
|
hit: true,
|
|
13120
|
-
message:
|
|
13303
|
+
message: path44
|
|
13121
13304
|
};
|
|
13122
13305
|
}
|
|
13123
13306
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
13124
13307
|
return {
|
|
13125
|
-
path:
|
|
13308
|
+
path: path44,
|
|
13126
13309
|
score: 0,
|
|
13127
13310
|
weight,
|
|
13128
13311
|
hit: false,
|
|
13129
|
-
message: `${
|
|
13312
|
+
message: `${path44} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
13130
13313
|
};
|
|
13131
13314
|
}
|
|
13132
13315
|
return {
|
|
13133
|
-
path:
|
|
13316
|
+
path: path44,
|
|
13134
13317
|
score: 0,
|
|
13135
13318
|
weight,
|
|
13136
13319
|
hit: false,
|
|
13137
|
-
message: `${
|
|
13320
|
+
message: `${path44} (value mismatch)`
|
|
13138
13321
|
};
|
|
13139
13322
|
}
|
|
13140
13323
|
/**
|
|
13141
13324
|
* Numeric comparison with absolute or relative tolerance.
|
|
13142
13325
|
*/
|
|
13143
|
-
compareNumericTolerance(
|
|
13326
|
+
compareNumericTolerance(path44, candidateValue, expectedValue, fieldConfig, weight) {
|
|
13144
13327
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
13145
13328
|
const candidateNum = toNumber2(candidateValue);
|
|
13146
13329
|
const expectedNum = toNumber2(expectedValue);
|
|
13147
13330
|
if (candidateNum === null || expectedNum === null) {
|
|
13148
13331
|
return {
|
|
13149
|
-
path:
|
|
13332
|
+
path: path44,
|
|
13150
13333
|
score: 0,
|
|
13151
13334
|
weight,
|
|
13152
13335
|
hit: false,
|
|
13153
|
-
message: `${
|
|
13336
|
+
message: `${path44} (non-numeric value)`
|
|
13154
13337
|
};
|
|
13155
13338
|
}
|
|
13156
13339
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
13157
13340
|
return {
|
|
13158
|
-
path:
|
|
13341
|
+
path: path44,
|
|
13159
13342
|
score: 0,
|
|
13160
13343
|
weight,
|
|
13161
13344
|
hit: false,
|
|
13162
|
-
message: `${
|
|
13345
|
+
message: `${path44} (invalid numeric value)`
|
|
13163
13346
|
};
|
|
13164
13347
|
}
|
|
13165
13348
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -13172,61 +13355,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
13172
13355
|
}
|
|
13173
13356
|
if (withinTolerance) {
|
|
13174
13357
|
return {
|
|
13175
|
-
path:
|
|
13358
|
+
path: path44,
|
|
13176
13359
|
score: 1,
|
|
13177
13360
|
weight,
|
|
13178
13361
|
hit: true,
|
|
13179
|
-
message: `${
|
|
13362
|
+
message: `${path44} (within tolerance: diff=${diff.toFixed(2)})`
|
|
13180
13363
|
};
|
|
13181
13364
|
}
|
|
13182
13365
|
return {
|
|
13183
|
-
path:
|
|
13366
|
+
path: path44,
|
|
13184
13367
|
score: 0,
|
|
13185
13368
|
weight,
|
|
13186
13369
|
hit: false,
|
|
13187
|
-
message: `${
|
|
13370
|
+
message: `${path44} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
13188
13371
|
};
|
|
13189
13372
|
}
|
|
13190
13373
|
/**
|
|
13191
13374
|
* Date comparison with format normalization.
|
|
13192
13375
|
*/
|
|
13193
|
-
compareDate(
|
|
13376
|
+
compareDate(path44, candidateValue, expectedValue, fieldConfig, weight) {
|
|
13194
13377
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
13195
13378
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
13196
13379
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
13197
13380
|
if (candidateDate === null) {
|
|
13198
13381
|
return {
|
|
13199
|
-
path:
|
|
13382
|
+
path: path44,
|
|
13200
13383
|
score: 0,
|
|
13201
13384
|
weight,
|
|
13202
13385
|
hit: false,
|
|
13203
|
-
message: `${
|
|
13386
|
+
message: `${path44} (unparseable candidate date)`
|
|
13204
13387
|
};
|
|
13205
13388
|
}
|
|
13206
13389
|
if (expectedDate === null) {
|
|
13207
13390
|
return {
|
|
13208
|
-
path:
|
|
13391
|
+
path: path44,
|
|
13209
13392
|
score: 0,
|
|
13210
13393
|
weight,
|
|
13211
13394
|
hit: false,
|
|
13212
|
-
message: `${
|
|
13395
|
+
message: `${path44} (unparseable expected date)`
|
|
13213
13396
|
};
|
|
13214
13397
|
}
|
|
13215
13398
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
13216
13399
|
return {
|
|
13217
|
-
path:
|
|
13400
|
+
path: path44,
|
|
13218
13401
|
score: 1,
|
|
13219
13402
|
weight,
|
|
13220
13403
|
hit: true,
|
|
13221
|
-
message:
|
|
13404
|
+
message: path44
|
|
13222
13405
|
};
|
|
13223
13406
|
}
|
|
13224
13407
|
return {
|
|
13225
|
-
path:
|
|
13408
|
+
path: path44,
|
|
13226
13409
|
score: 0,
|
|
13227
13410
|
weight,
|
|
13228
13411
|
hit: false,
|
|
13229
|
-
message: `${
|
|
13412
|
+
message: `${path44} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
13230
13413
|
};
|
|
13231
13414
|
}
|
|
13232
13415
|
/**
|
|
@@ -13267,11 +13450,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
13267
13450
|
};
|
|
13268
13451
|
}
|
|
13269
13452
|
};
|
|
13270
|
-
function resolvePath(obj,
|
|
13271
|
-
if (!
|
|
13453
|
+
function resolvePath(obj, path44) {
|
|
13454
|
+
if (!path44 || !obj) {
|
|
13272
13455
|
return void 0;
|
|
13273
13456
|
}
|
|
13274
|
-
const parts =
|
|
13457
|
+
const parts = path44.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
13275
13458
|
let current = obj;
|
|
13276
13459
|
for (const part of parts) {
|
|
13277
13460
|
if (current === null || current === void 0) {
|
|
@@ -14089,8 +14272,8 @@ var TokenUsageEvaluator = class {
|
|
|
14089
14272
|
};
|
|
14090
14273
|
|
|
14091
14274
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
14092
|
-
function getNestedValue(obj,
|
|
14093
|
-
const parts =
|
|
14275
|
+
function getNestedValue(obj, path44) {
|
|
14276
|
+
const parts = path44.split(".");
|
|
14094
14277
|
let current = obj;
|
|
14095
14278
|
for (const part of parts) {
|
|
14096
14279
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -14653,7 +14836,7 @@ function runEqualsAssertion(output, value) {
|
|
|
14653
14836
|
// src/evaluation/orchestrator.ts
|
|
14654
14837
|
var import_node_crypto9 = require("crypto");
|
|
14655
14838
|
var import_promises29 = require("fs/promises");
|
|
14656
|
-
var
|
|
14839
|
+
var import_node_path42 = __toESM(require("path"), 1);
|
|
14657
14840
|
var import_micromatch4 = __toESM(require("micromatch"), 1);
|
|
14658
14841
|
|
|
14659
14842
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -15523,7 +15706,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
15523
15706
|
}
|
|
15524
15707
|
}
|
|
15525
15708
|
|
|
15526
|
-
// src/evaluation/workspace/
|
|
15709
|
+
// src/evaluation/workspace/pool-manager.ts
|
|
15527
15710
|
var import_node_child_process7 = require("child_process");
|
|
15528
15711
|
var import_node_crypto8 = require("crypto");
|
|
15529
15712
|
var import_node_fs11 = require("fs");
|
|
@@ -15531,8 +15714,6 @@ var import_promises27 = require("fs/promises");
|
|
|
15531
15714
|
var import_node_path39 = __toESM(require("path"), 1);
|
|
15532
15715
|
var import_node_util5 = require("util");
|
|
15533
15716
|
var execFileAsync = (0, import_node_util5.promisify)(import_node_child_process7.execFile);
|
|
15534
|
-
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
15535
|
-
var LOCK_TIMEOUT_MS = 6e4;
|
|
15536
15717
|
function gitEnv() {
|
|
15537
15718
|
const env = { ...process.env };
|
|
15538
15719
|
for (const key of Object.keys(env)) {
|
|
@@ -15547,160 +15728,326 @@ function gitEnv() {
|
|
|
15547
15728
|
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
15548
15729
|
};
|
|
15549
15730
|
}
|
|
15550
|
-
function cacheKey(source) {
|
|
15551
|
-
const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
|
|
15552
|
-
return (0, import_node_crypto8.createHash)("sha256").update(raw).digest("hex");
|
|
15553
|
-
}
|
|
15554
|
-
function getSourceUrl(source) {
|
|
15555
|
-
return source.type === "git" ? source.url : source.path;
|
|
15556
|
-
}
|
|
15557
15731
|
async function git(args, opts) {
|
|
15558
15732
|
const { stdout } = await execFileAsync("git", args, {
|
|
15559
15733
|
cwd: opts?.cwd,
|
|
15560
|
-
timeout: opts?.timeout ??
|
|
15734
|
+
timeout: opts?.timeout ?? 3e5,
|
|
15561
15735
|
env: gitEnv(),
|
|
15562
15736
|
maxBuffer: 50 * 1024 * 1024
|
|
15563
|
-
// 50MB
|
|
15564
15737
|
});
|
|
15565
15738
|
return stdout.trim();
|
|
15566
15739
|
}
|
|
15567
|
-
|
|
15568
|
-
const
|
|
15569
|
-
|
|
15570
|
-
|
|
15571
|
-
|
|
15572
|
-
|
|
15573
|
-
|
|
15574
|
-
|
|
15575
|
-
|
|
15740
|
+
function normalizeRepoForFingerprint(repo) {
|
|
15741
|
+
const source = repo.source.type === "git" ? { type: "git", url: repo.source.url.toLowerCase().replace(/\.git$/, "") } : { type: "local", path: repo.source.path };
|
|
15742
|
+
const result = {
|
|
15743
|
+
path: repo.path,
|
|
15744
|
+
source,
|
|
15745
|
+
ref: repo.checkout?.ref ?? "HEAD"
|
|
15746
|
+
};
|
|
15747
|
+
if (repo.clone?.depth !== void 0) {
|
|
15748
|
+
result.depth = repo.clone.depth;
|
|
15749
|
+
}
|
|
15750
|
+
if (repo.clone?.filter !== void 0) {
|
|
15751
|
+
result.filter = repo.clone.filter;
|
|
15752
|
+
}
|
|
15753
|
+
if (repo.clone?.sparse?.length) {
|
|
15754
|
+
result.sparse = [...repo.clone.sparse].sort();
|
|
15755
|
+
}
|
|
15756
|
+
return result;
|
|
15757
|
+
}
|
|
15758
|
+
function computeWorkspaceFingerprint(templatePath, repos) {
|
|
15759
|
+
const canonical = {
|
|
15760
|
+
templatePath: templatePath ?? null,
|
|
15761
|
+
repos: [...repos].sort((a, b) => a.path.localeCompare(b.path)).map(normalizeRepoForFingerprint)
|
|
15762
|
+
};
|
|
15763
|
+
return (0, import_node_crypto8.createHash)("sha256").update(JSON.stringify(canonical)).digest("hex");
|
|
15764
|
+
}
|
|
15765
|
+
async function copyDirectoryRecursive2(src, dest, skipDirs) {
|
|
15766
|
+
await (0, import_promises27.mkdir)(dest, { recursive: true });
|
|
15767
|
+
const entries = await (0, import_promises27.readdir)(src, { withFileTypes: true });
|
|
15768
|
+
for (const entry of entries) {
|
|
15769
|
+
const srcPath = import_node_path39.default.join(src, entry.name);
|
|
15770
|
+
const destPath = import_node_path39.default.join(dest, entry.name);
|
|
15771
|
+
if (entry.name === ".git") {
|
|
15772
|
+
continue;
|
|
15773
|
+
}
|
|
15774
|
+
if (entry.isDirectory()) {
|
|
15775
|
+
if (skipDirs?.has(entry.name)) {
|
|
15576
15776
|
continue;
|
|
15577
15777
|
}
|
|
15578
|
-
|
|
15778
|
+
await copyDirectoryRecursive2(srcPath, destPath, skipDirs);
|
|
15779
|
+
} else {
|
|
15780
|
+
await (0, import_promises27.cp)(srcPath, destPath, { preserveTimestamps: true, force: true });
|
|
15579
15781
|
}
|
|
15580
15782
|
}
|
|
15581
|
-
throw new Error(`Timed out waiting for lock: ${lockPath}`);
|
|
15582
15783
|
}
|
|
15583
|
-
|
|
15584
|
-
|
|
15585
|
-
|
|
15586
|
-
|
|
15587
|
-
}
|
|
15588
|
-
}
|
|
15589
|
-
var RepoManager = class {
|
|
15590
|
-
cacheDir;
|
|
15591
|
-
verbose;
|
|
15592
|
-
constructor(cacheDir, verbose = false) {
|
|
15593
|
-
this.cacheDir = cacheDir ?? getGitCacheRoot();
|
|
15594
|
-
this.verbose = verbose;
|
|
15784
|
+
var WorkspacePoolManager = class {
|
|
15785
|
+
poolRoot;
|
|
15786
|
+
constructor(poolRoot) {
|
|
15787
|
+
this.poolRoot = poolRoot ?? getWorkspacePoolRoot();
|
|
15595
15788
|
}
|
|
15596
|
-
|
|
15597
|
-
|
|
15598
|
-
|
|
15599
|
-
|
|
15600
|
-
|
|
15789
|
+
/**
|
|
15790
|
+
* Acquire a workspace slot from the pool.
|
|
15791
|
+
*
|
|
15792
|
+
* 1. Compute fingerprint from template + repos
|
|
15793
|
+
* 2. Check drift (compare stored metadata.json fingerprint vs computed)
|
|
15794
|
+
* 3. If drift: warn, remove all slots, rematerialize
|
|
15795
|
+
* 4. Acquire a slot (try-lock slot-0, slot-1, ..., up to maxSlots)
|
|
15796
|
+
* 5. If slot exists: reset repos, re-copy template files (skip repo directories)
|
|
15797
|
+
* 6. If new slot: copy template, materialize all repos, write metadata.json
|
|
15798
|
+
* 7. Return the slot (with path, index, isExisting)
|
|
15799
|
+
*/
|
|
15800
|
+
async acquireWorkspace(options) {
|
|
15801
|
+
const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
|
|
15802
|
+
const fingerprint = computeWorkspaceFingerprint(templatePath, repos);
|
|
15803
|
+
const poolDir = import_node_path39.default.join(this.poolRoot, fingerprint);
|
|
15804
|
+
await (0, import_promises27.mkdir)(poolDir, { recursive: true });
|
|
15805
|
+
const drifted = await this.checkDrift(poolDir, fingerprint);
|
|
15806
|
+
if (drifted) {
|
|
15807
|
+
console.warn(
|
|
15808
|
+
`[workspace-pool] Drift detected for fingerprint ${fingerprint.slice(0, 12)}... Removing stale slots.`
|
|
15601
15809
|
);
|
|
15810
|
+
await this.removeAllSlots(poolDir);
|
|
15602
15811
|
}
|
|
15603
|
-
|
|
15604
|
-
const
|
|
15605
|
-
|
|
15606
|
-
|
|
15607
|
-
|
|
15608
|
-
|
|
15812
|
+
for (let i = 0; i < maxSlots; i++) {
|
|
15813
|
+
const slotPath = import_node_path39.default.join(poolDir, `slot-${i}`);
|
|
15814
|
+
const lockPath = `${slotPath}.lock`;
|
|
15815
|
+
const locked = await this.tryLock(lockPath);
|
|
15816
|
+
if (!locked) {
|
|
15817
|
+
continue;
|
|
15609
15818
|
}
|
|
15610
|
-
|
|
15611
|
-
|
|
15612
|
-
|
|
15613
|
-
|
|
15614
|
-
|
|
15615
|
-
|
|
15616
|
-
|
|
15819
|
+
const slotExists = (0, import_node_fs11.existsSync)(slotPath);
|
|
15820
|
+
if (slotExists) {
|
|
15821
|
+
await this.resetSlot(slotPath, templatePath, repos, poolReset);
|
|
15822
|
+
return {
|
|
15823
|
+
index: i,
|
|
15824
|
+
path: slotPath,
|
|
15825
|
+
isExisting: true,
|
|
15826
|
+
lockPath,
|
|
15827
|
+
fingerprint,
|
|
15828
|
+
poolDir
|
|
15829
|
+
};
|
|
15617
15830
|
}
|
|
15618
|
-
|
|
15831
|
+
await (0, import_promises27.mkdir)(slotPath, { recursive: true });
|
|
15832
|
+
if (templatePath) {
|
|
15833
|
+
await copyDirectoryRecursive2(templatePath, slotPath);
|
|
15834
|
+
}
|
|
15835
|
+
if (repos.length > 0) {
|
|
15836
|
+
await repoManager.materializeAll(repos, slotPath);
|
|
15837
|
+
}
|
|
15838
|
+
await this.writeMetadata(poolDir, fingerprint, templatePath ?? null, repos);
|
|
15839
|
+
return {
|
|
15840
|
+
index: i,
|
|
15841
|
+
path: slotPath,
|
|
15842
|
+
isExisting: false,
|
|
15843
|
+
lockPath,
|
|
15844
|
+
fingerprint,
|
|
15845
|
+
poolDir
|
|
15846
|
+
};
|
|
15847
|
+
}
|
|
15848
|
+
throw new Error(
|
|
15849
|
+
`All ${maxSlots} pool slots are locked for fingerprint ${fingerprint.slice(0, 12)}...`
|
|
15850
|
+
);
|
|
15851
|
+
}
|
|
15852
|
+
/** Remove lock file to release a slot. */
|
|
15853
|
+
async releaseSlot(slot) {
|
|
15854
|
+
try {
|
|
15855
|
+
await (0, import_promises27.unlink)(slot.lockPath);
|
|
15856
|
+
} catch {
|
|
15619
15857
|
}
|
|
15620
15858
|
}
|
|
15621
15859
|
/**
|
|
15622
|
-
*
|
|
15623
|
-
*
|
|
15624
|
-
* Returns
|
|
15860
|
+
* Try to acquire a PID-based lock file.
|
|
15861
|
+
* On EEXIST, read PID and check if process is alive. If dead, stale lock — remove and retry.
|
|
15862
|
+
* Returns true if lock acquired, false if slot is actively locked.
|
|
15863
|
+
* Uses a bounded loop (max 3 attempts) to avoid unbounded recursion.
|
|
15625
15864
|
*/
|
|
15626
|
-
async
|
|
15627
|
-
|
|
15628
|
-
|
|
15629
|
-
|
|
15630
|
-
|
|
15631
|
-
|
|
15632
|
-
|
|
15633
|
-
|
|
15634
|
-
|
|
15865
|
+
async tryLock(lockPath) {
|
|
15866
|
+
for (let attempt = 0; attempt < 3; attempt++) {
|
|
15867
|
+
try {
|
|
15868
|
+
await (0, import_promises27.writeFile)(lockPath, String(process.pid), { flag: "wx" });
|
|
15869
|
+
return true;
|
|
15870
|
+
} catch (err) {
|
|
15871
|
+
if (err.code !== "EEXIST") {
|
|
15872
|
+
throw err;
|
|
15873
|
+
}
|
|
15874
|
+
try {
|
|
15875
|
+
const pidStr = await (0, import_promises27.readFile)(lockPath, "utf-8");
|
|
15876
|
+
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
15877
|
+
if (!Number.isNaN(pid)) {
|
|
15878
|
+
try {
|
|
15879
|
+
process.kill(pid, 0);
|
|
15880
|
+
return false;
|
|
15881
|
+
} catch {
|
|
15882
|
+
await (0, import_promises27.unlink)(lockPath).catch(() => {
|
|
15883
|
+
});
|
|
15884
|
+
continue;
|
|
15885
|
+
}
|
|
15886
|
+
}
|
|
15887
|
+
} catch {
|
|
15888
|
+
}
|
|
15889
|
+
return false;
|
|
15890
|
+
}
|
|
15891
|
+
}
|
|
15892
|
+
return false;
|
|
15893
|
+
}
|
|
15894
|
+
/**
|
|
15895
|
+
* Check if the stored fingerprint in metadata.json differs from the computed one.
|
|
15896
|
+
* Returns true if drifted, false otherwise.
|
|
15897
|
+
* Returns false (no drift) if metadata.json doesn't exist (first use).
|
|
15898
|
+
*/
|
|
15899
|
+
async checkDrift(poolDir, fingerprint) {
|
|
15900
|
+
const metadataPath = import_node_path39.default.join(poolDir, "metadata.json");
|
|
15901
|
+
try {
|
|
15902
|
+
const raw = await (0, import_promises27.readFile)(metadataPath, "utf-8");
|
|
15903
|
+
const metadata = JSON.parse(raw);
|
|
15904
|
+
return metadata.fingerprint !== fingerprint;
|
|
15905
|
+
} catch {
|
|
15906
|
+
return false;
|
|
15635
15907
|
}
|
|
15636
|
-
|
|
15637
|
-
|
|
15638
|
-
|
|
15639
|
-
|
|
15908
|
+
}
|
|
15909
|
+
/** Write metadata.json with fingerprint, inputs, and timestamp. */
|
|
15910
|
+
async writeMetadata(poolDir, fingerprint, templatePath, repos) {
|
|
15911
|
+
const metadata = {
|
|
15912
|
+
fingerprint,
|
|
15913
|
+
templatePath,
|
|
15914
|
+
repos,
|
|
15915
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
15916
|
+
};
|
|
15917
|
+
await (0, import_promises27.writeFile)(import_node_path39.default.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
|
|
15918
|
+
}
|
|
15919
|
+
/** Remove all slot directories and their lock files from a pool directory. */
|
|
15920
|
+
async removeAllSlots(poolDir) {
|
|
15921
|
+
const entries = await (0, import_promises27.readdir)(poolDir);
|
|
15922
|
+
for (const entry of entries) {
|
|
15923
|
+
if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
|
|
15924
|
+
const lockPath = import_node_path39.default.join(poolDir, `${entry}.lock`);
|
|
15925
|
+
if ((0, import_node_fs11.existsSync)(lockPath)) {
|
|
15926
|
+
try {
|
|
15927
|
+
const pidStr = await (0, import_promises27.readFile)(lockPath, "utf-8");
|
|
15928
|
+
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
15929
|
+
if (!Number.isNaN(pid)) {
|
|
15930
|
+
try {
|
|
15931
|
+
process.kill(pid, 0);
|
|
15932
|
+
console.warn(`[workspace-pool] Skipping slot ${entry}: locked by PID ${pid}`);
|
|
15933
|
+
continue;
|
|
15934
|
+
} catch {
|
|
15935
|
+
}
|
|
15936
|
+
}
|
|
15937
|
+
} catch {
|
|
15938
|
+
}
|
|
15640
15939
|
}
|
|
15641
|
-
|
|
15940
|
+
await (0, import_promises27.rm)(import_node_path39.default.join(poolDir, entry), { recursive: true, force: true });
|
|
15941
|
+
await (0, import_promises27.rm)(lockPath, { force: true }).catch(() => {
|
|
15942
|
+
});
|
|
15642
15943
|
}
|
|
15643
|
-
|
|
15644
|
-
|
|
15645
|
-
|
|
15944
|
+
}
|
|
15945
|
+
await (0, import_promises27.rm)(import_node_path39.default.join(poolDir, "metadata.json"), { force: true }).catch(() => {
|
|
15946
|
+
});
|
|
15947
|
+
}
|
|
15948
|
+
/**
|
|
15949
|
+
* Reset an existing slot for reuse:
|
|
15950
|
+
* 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
|
|
15951
|
+
* 2. Re-copy template files (skip repo directories)
|
|
15952
|
+
*/
|
|
15953
|
+
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
15954
|
+
for (const repo of repos) {
|
|
15955
|
+
const repoDir = import_node_path39.default.join(slotPath, repo.path);
|
|
15956
|
+
if (!(0, import_node_fs11.existsSync)(repoDir)) {
|
|
15957
|
+
continue;
|
|
15958
|
+
}
|
|
15959
|
+
if (poolReset === "none") {
|
|
15960
|
+
continue;
|
|
15961
|
+
}
|
|
15962
|
+
const ref = repo.checkout?.ref ?? "HEAD";
|
|
15963
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
15964
|
+
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
15965
|
+
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
15966
|
+
}
|
|
15967
|
+
if (templatePath) {
|
|
15968
|
+
const repoDirNames = new Set(
|
|
15969
|
+
repos.map((r) => {
|
|
15970
|
+
const normalized = r.path.replace(/^\.\//, "");
|
|
15971
|
+
return normalized.split("/")[0];
|
|
15972
|
+
})
|
|
15646
15973
|
);
|
|
15974
|
+
await copyDirectoryRecursive2(templatePath, slotPath, repoDirNames);
|
|
15647
15975
|
}
|
|
15648
|
-
|
|
15649
|
-
|
|
15650
|
-
|
|
15976
|
+
}
|
|
15977
|
+
};
|
|
15978
|
+
|
|
15979
|
+
// src/evaluation/workspace/repo-manager.ts
|
|
15980
|
+
var import_node_child_process8 = require("child_process");
|
|
15981
|
+
var import_node_path40 = __toESM(require("path"), 1);
|
|
15982
|
+
var import_node_util6 = require("util");
|
|
15983
|
+
var execFileAsync2 = (0, import_node_util6.promisify)(import_node_child_process8.execFile);
|
|
15984
|
+
var DEFAULT_TIMEOUT_MS2 = 3e5;
|
|
15985
|
+
function gitEnv2() {
|
|
15986
|
+
const env = { ...process.env };
|
|
15987
|
+
for (const key of Object.keys(env)) {
|
|
15988
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
15989
|
+
delete env[key];
|
|
15990
|
+
}
|
|
15991
|
+
}
|
|
15992
|
+
return {
|
|
15993
|
+
...env,
|
|
15994
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
15995
|
+
GIT_ASKPASS: "",
|
|
15996
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
15997
|
+
};
|
|
15998
|
+
}
|
|
15999
|
+
function getSourceUrl(source) {
|
|
16000
|
+
return source.type === "git" ? source.url : source.path;
|
|
16001
|
+
}
|
|
16002
|
+
async function git2(args, opts) {
|
|
16003
|
+
const { stdout } = await execFileAsync2("git", args, {
|
|
16004
|
+
cwd: opts?.cwd,
|
|
16005
|
+
timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
|
|
16006
|
+
env: gitEnv2(),
|
|
16007
|
+
maxBuffer: 50 * 1024 * 1024
|
|
16008
|
+
// 50MB
|
|
16009
|
+
});
|
|
16010
|
+
return stdout.trim();
|
|
16011
|
+
}
|
|
16012
|
+
var RepoManager = class {
|
|
16013
|
+
verbose;
|
|
16014
|
+
constructor(verbose = false) {
|
|
16015
|
+
this.verbose = verbose;
|
|
16016
|
+
}
|
|
16017
|
+
async runGit(args, opts) {
|
|
16018
|
+
const startedAt = Date.now();
|
|
15651
16019
|
if (this.verbose) {
|
|
15652
|
-
console.log(
|
|
15653
|
-
`[repo] lock acquired path=${lockPath} waitedMs=${Date.now() - lockStartedAt}`
|
|
15654
|
-
);
|
|
16020
|
+
console.log(`[repo] git start cwd=${opts?.cwd ?? process.cwd()} args=${args.join(" ")}`);
|
|
15655
16021
|
}
|
|
15656
16022
|
try {
|
|
15657
|
-
|
|
15658
|
-
|
|
15659
|
-
|
|
15660
|
-
}
|
|
15661
|
-
const fetchArgs = ["fetch", "--prune"];
|
|
15662
|
-
if (depth) {
|
|
15663
|
-
fetchArgs.push("--depth", String(depth));
|
|
15664
|
-
}
|
|
15665
|
-
await this.runGit(fetchArgs, { cwd: cachePath });
|
|
15666
|
-
} else {
|
|
15667
|
-
if (this.verbose) {
|
|
15668
|
-
console.log(`[repo] creating new cache ${cachePath}`);
|
|
15669
|
-
}
|
|
15670
|
-
const cloneArgs = ["clone", "--mirror", "--bare"];
|
|
15671
|
-
if (depth) {
|
|
15672
|
-
cloneArgs.push("--depth", String(depth));
|
|
15673
|
-
}
|
|
15674
|
-
const sourceUrl = getSourceUrl(source);
|
|
15675
|
-
const cloneUrl = depth && source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
|
|
15676
|
-
cloneArgs.push(cloneUrl, cachePath);
|
|
15677
|
-
await this.runGit(cloneArgs);
|
|
16023
|
+
const output = await git2(args, opts);
|
|
16024
|
+
if (this.verbose) {
|
|
16025
|
+
console.log(`[repo] git ok durationMs=${Date.now() - startedAt} args=${args.join(" ")}`);
|
|
15678
16026
|
}
|
|
15679
|
-
|
|
15680
|
-
|
|
16027
|
+
return output;
|
|
16028
|
+
} catch (error) {
|
|
15681
16029
|
if (this.verbose) {
|
|
15682
|
-
|
|
16030
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16031
|
+
console.log(
|
|
16032
|
+
`[repo] git fail durationMs=${Date.now() - startedAt} args=${args.join(" ")} error=${message}`
|
|
16033
|
+
);
|
|
15683
16034
|
}
|
|
16035
|
+
throw error;
|
|
15684
16036
|
}
|
|
15685
|
-
return cachePath;
|
|
15686
16037
|
}
|
|
15687
16038
|
/**
|
|
15688
|
-
* Clone a repo from
|
|
16039
|
+
* Clone a repo directly from source into the workspace at the configured path.
|
|
15689
16040
|
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
15690
16041
|
*/
|
|
15691
16042
|
async materialize(repo, workspacePath) {
|
|
15692
|
-
const targetDir =
|
|
16043
|
+
const targetDir = import_node_path40.default.join(workspacePath, repo.path);
|
|
16044
|
+
const sourceUrl = getSourceUrl(repo.source);
|
|
15693
16045
|
const startedAt = Date.now();
|
|
15694
16046
|
if (this.verbose) {
|
|
15695
16047
|
console.log(
|
|
15696
|
-
`[repo] materialize start path=${repo.path} source=${
|
|
16048
|
+
`[repo] materialize start path=${repo.path} source=${sourceUrl} workspace=${workspacePath}`
|
|
15697
16049
|
);
|
|
15698
16050
|
}
|
|
15699
|
-
const cachePath = await this.ensureCache(
|
|
15700
|
-
repo.source,
|
|
15701
|
-
repo.clone?.depth,
|
|
15702
|
-
repo.checkout?.resolve
|
|
15703
|
-
);
|
|
15704
16051
|
const cloneArgs = ["clone"];
|
|
15705
16052
|
if (repo.clone?.depth) {
|
|
15706
16053
|
cloneArgs.push("--depth", String(repo.clone.depth));
|
|
@@ -15709,7 +16056,7 @@ var RepoManager = class {
|
|
|
15709
16056
|
cloneArgs.push("--filter", repo.clone.filter);
|
|
15710
16057
|
}
|
|
15711
16058
|
cloneArgs.push("--no-checkout");
|
|
15712
|
-
const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${
|
|
16059
|
+
const cloneUrl = (repo.clone?.depth || repo.clone?.filter) && repo.source.type === "local" ? `file://${sourceUrl}` : sourceUrl;
|
|
15713
16060
|
cloneArgs.push(cloneUrl, targetDir);
|
|
15714
16061
|
await this.runGit(cloneArgs);
|
|
15715
16062
|
if (repo.clone?.sparse?.length) {
|
|
@@ -15781,66 +16128,28 @@ var RepoManager = class {
|
|
|
15781
16128
|
}
|
|
15782
16129
|
}
|
|
15783
16130
|
/** Reset repos in workspace to their checkout state. */
|
|
15784
|
-
async reset(repos, workspacePath,
|
|
15785
|
-
|
|
15786
|
-
for (const repo of repos) {
|
|
15787
|
-
const targetDir = import_node_path39.default.join(workspacePath, repo.path);
|
|
15788
|
-
await (0, import_promises27.rm)(targetDir, { recursive: true, force: true });
|
|
15789
|
-
}
|
|
15790
|
-
await this.materializeAll(repos, workspacePath);
|
|
15791
|
-
return;
|
|
15792
|
-
}
|
|
16131
|
+
async reset(repos, workspacePath, reset) {
|
|
16132
|
+
const cleanFlag = reset === "strict" ? "-fdx" : "-fd";
|
|
15793
16133
|
for (const repo of repos) {
|
|
15794
|
-
const targetDir =
|
|
16134
|
+
const targetDir = import_node_path40.default.join(workspacePath, repo.path);
|
|
15795
16135
|
await this.runGit(["reset", "--hard", "HEAD"], { cwd: targetDir });
|
|
15796
|
-
await this.runGit(["clean",
|
|
15797
|
-
}
|
|
15798
|
-
}
|
|
15799
|
-
/**
|
|
15800
|
-
* Seed the cache from a local repository, setting the remote to a given URL.
|
|
15801
|
-
* Useful for avoiding slow network clones when a local clone already exists.
|
|
15802
|
-
*/
|
|
15803
|
-
async seedCache(localPath, remoteUrl, opts) {
|
|
15804
|
-
const source = { type: "git", url: remoteUrl };
|
|
15805
|
-
const key = cacheKey(source);
|
|
15806
|
-
const cachePath = import_node_path39.default.join(this.cacheDir, key);
|
|
15807
|
-
const lockPath = `${cachePath}.lock`;
|
|
15808
|
-
await (0, import_promises27.mkdir)(this.cacheDir, { recursive: true });
|
|
15809
|
-
await acquireLock(lockPath);
|
|
15810
|
-
try {
|
|
15811
|
-
if ((0, import_node_fs11.existsSync)(import_node_path39.default.join(cachePath, "HEAD"))) {
|
|
15812
|
-
if (!opts?.force) {
|
|
15813
|
-
throw new Error(
|
|
15814
|
-
`Cache already exists for ${remoteUrl} at ${cachePath}. Use force to overwrite.`
|
|
15815
|
-
);
|
|
15816
|
-
}
|
|
15817
|
-
await (0, import_promises27.rm)(cachePath, { recursive: true, force: true });
|
|
15818
|
-
}
|
|
15819
|
-
await git(["clone", "--mirror", "--bare", localPath, cachePath]);
|
|
15820
|
-
await git(["remote", "set-url", "origin", remoteUrl], { cwd: cachePath });
|
|
15821
|
-
} finally {
|
|
15822
|
-
await releaseLock(lockPath);
|
|
16136
|
+
await this.runGit(["clean", cleanFlag], { cwd: targetDir });
|
|
15823
16137
|
}
|
|
15824
|
-
return cachePath;
|
|
15825
|
-
}
|
|
15826
|
-
/** Remove the entire cache directory. */
|
|
15827
|
-
async cleanCache() {
|
|
15828
|
-
await (0, import_promises27.rm)(this.cacheDir, { recursive: true, force: true });
|
|
15829
16138
|
}
|
|
15830
16139
|
};
|
|
15831
16140
|
|
|
15832
16141
|
// src/evaluation/workspace/resolve.ts
|
|
15833
16142
|
var import_promises28 = require("fs/promises");
|
|
15834
|
-
var
|
|
16143
|
+
var import_node_path41 = __toESM(require("path"), 1);
|
|
15835
16144
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
15836
16145
|
if (!templatePath) {
|
|
15837
16146
|
return void 0;
|
|
15838
16147
|
}
|
|
15839
|
-
const resolved =
|
|
16148
|
+
const resolved = import_node_path41.default.resolve(templatePath);
|
|
15840
16149
|
const stats = await (0, import_promises28.stat)(resolved);
|
|
15841
16150
|
if (stats.isFile()) {
|
|
15842
16151
|
return {
|
|
15843
|
-
dir:
|
|
16152
|
+
dir: import_node_path41.default.dirname(resolved),
|
|
15844
16153
|
workspaceFile: resolved
|
|
15845
16154
|
};
|
|
15846
16155
|
}
|
|
@@ -15852,14 +16161,14 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
15852
16161
|
if (workspaceFiles.length === 1) {
|
|
15853
16162
|
return {
|
|
15854
16163
|
dir: resolved,
|
|
15855
|
-
workspaceFile:
|
|
16164
|
+
workspaceFile: import_node_path41.default.join(resolved, workspaceFiles[0])
|
|
15856
16165
|
};
|
|
15857
16166
|
}
|
|
15858
16167
|
if (workspaceFiles.length > 1) {
|
|
15859
16168
|
const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
|
|
15860
16169
|
return {
|
|
15861
16170
|
dir: resolved,
|
|
15862
|
-
workspaceFile: conventionFile ?
|
|
16171
|
+
workspaceFile: conventionFile ? import_node_path41.default.join(resolved, conventionFile) : void 0
|
|
15863
16172
|
};
|
|
15864
16173
|
}
|
|
15865
16174
|
return { dir: resolved };
|
|
@@ -15911,6 +16220,22 @@ function classifyQualityStatus(score) {
|
|
|
15911
16220
|
function usesFileReferencePrompt(provider) {
|
|
15912
16221
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
15913
16222
|
}
|
|
16223
|
+
function toScriptConfig(hook, hookName, context2) {
|
|
16224
|
+
const command = hook.command ?? hook.script;
|
|
16225
|
+
if (!command || command.length === 0) {
|
|
16226
|
+
throw new Error(`${hookName} hook in ${context2} requires command or script`);
|
|
16227
|
+
}
|
|
16228
|
+
return {
|
|
16229
|
+
command,
|
|
16230
|
+
...hook.timeout_ms !== void 0 && { timeout_ms: hook.timeout_ms },
|
|
16231
|
+
...hook.timeoutMs !== void 0 && { timeoutMs: hook.timeoutMs },
|
|
16232
|
+
...hook.cwd !== void 0 && { cwd: hook.cwd },
|
|
16233
|
+
...hook.script !== void 0 && { script: hook.script }
|
|
16234
|
+
};
|
|
16235
|
+
}
|
|
16236
|
+
function hasHookCommand(hook) {
|
|
16237
|
+
return !!(hook?.command && hook.command.length > 0 || hook?.script && hook.script.length > 0);
|
|
16238
|
+
}
|
|
15914
16239
|
function getWorkspaceTemplate(target) {
|
|
15915
16240
|
const config = target.config;
|
|
15916
16241
|
if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
|
|
@@ -15941,7 +16266,15 @@ async function runEvaluation(options) {
|
|
|
15941
16266
|
trials,
|
|
15942
16267
|
streamCallbacks,
|
|
15943
16268
|
totalBudgetUsd,
|
|
15944
|
-
failOnError
|
|
16269
|
+
failOnError,
|
|
16270
|
+
poolWorkspaces,
|
|
16271
|
+
poolMaxSlots: configPoolMaxSlots,
|
|
16272
|
+
workspace: legacyWorkspacePath,
|
|
16273
|
+
workspaceMode,
|
|
16274
|
+
workspacePath,
|
|
16275
|
+
workspaceClean,
|
|
16276
|
+
retainOnSuccess,
|
|
16277
|
+
retainOnFailure
|
|
15945
16278
|
} = options;
|
|
15946
16279
|
let useCache = options.useCache;
|
|
15947
16280
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -16015,7 +16348,7 @@ async function runEvaluation(options) {
|
|
|
16015
16348
|
];
|
|
16016
16349
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
16017
16350
|
const typeRegistry = createBuiltinRegistry();
|
|
16018
|
-
const discoveryBaseDir = evalFilePath ?
|
|
16351
|
+
const discoveryBaseDir = evalFilePath ? import_node_path42.default.dirname(import_node_path42.default.resolve(evalFilePath)) : process.cwd();
|
|
16019
16352
|
const evalDir = discoveryBaseDir;
|
|
16020
16353
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
16021
16354
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
@@ -16077,13 +16410,29 @@ async function runEvaluation(options) {
|
|
|
16077
16410
|
}
|
|
16078
16411
|
};
|
|
16079
16412
|
const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
|
|
16080
|
-
const
|
|
16413
|
+
const configuredMode = suiteWorkspace?.mode ?? workspaceMode;
|
|
16414
|
+
const configuredStaticPath = suiteWorkspace?.static_path ?? workspacePath ?? legacyWorkspacePath;
|
|
16415
|
+
const useStaticWorkspace = configuredMode === "static" || !!configuredStaticPath && !configuredMode;
|
|
16416
|
+
if (useStaticWorkspace && isPerTestIsolation) {
|
|
16417
|
+
throw new Error(
|
|
16418
|
+
"static workspace mode is incompatible with isolation: per_test. Use isolation: shared (default)."
|
|
16419
|
+
);
|
|
16420
|
+
}
|
|
16421
|
+
if (configuredMode === "static" && !configuredStaticPath) {
|
|
16422
|
+
throw new Error("workspace.mode=static requires workspace.static_path or --workspace-path");
|
|
16423
|
+
}
|
|
16424
|
+
const hasSharedWorkspace = !!(useStaticWorkspace || workspaceTemplate || suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation);
|
|
16425
|
+
const poolEnabled = configuredMode === "pooled" ? true : configuredMode === "ephemeral" || useStaticWorkspace ? false : suiteWorkspace?.pool ?? poolWorkspaces ?? true;
|
|
16426
|
+
const usePool = poolEnabled !== false && !!suiteWorkspace?.repos?.length && !isPerTestIsolation && !useStaticWorkspace;
|
|
16427
|
+
const finishCleanPolicy = suiteWorkspace?.hooks?.on_finish?.clean;
|
|
16428
|
+
const resolvedRetainOnSuccess = (finishCleanPolicy === "always" || finishCleanPolicy === "on_success" ? "cleanup" : finishCleanPolicy === "on_failure" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup");
|
|
16429
|
+
const resolvedRetainOnFailure = (finishCleanPolicy === "always" || finishCleanPolicy === "on_failure" ? "cleanup" : finishCleanPolicy === "on_success" || finishCleanPolicy === "never" ? "keep" : void 0) ?? retainOnFailure ?? (cleanupWorkspaces ? "cleanup" : "keep");
|
|
16081
16430
|
const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
|
|
16082
|
-
const workers = hasSharedWorkspace ? 1 : requestedWorkers;
|
|
16431
|
+
const workers = hasSharedWorkspace && !usePool ? 1 : requestedWorkers;
|
|
16083
16432
|
setupLog(
|
|
16084
|
-
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
|
|
16433
|
+
`sharedWorkspace=${hasSharedWorkspace} perTestIsolation=${isPerTestIsolation} usePool=${usePool} requestedWorkers=${requestedWorkers} effectiveWorkers=${workers}`
|
|
16085
16434
|
);
|
|
16086
|
-
if (hasSharedWorkspace && requestedWorkers > 1) {
|
|
16435
|
+
if (hasSharedWorkspace && !usePool && requestedWorkers > 1) {
|
|
16087
16436
|
console.warn(
|
|
16088
16437
|
`Warning: Shared workspace requires sequential execution. Overriding workers from ${requestedWorkers} to 1.`
|
|
16089
16438
|
);
|
|
@@ -16092,7 +16441,38 @@ async function runEvaluation(options) {
|
|
|
16092
16441
|
let sharedWorkspacePath;
|
|
16093
16442
|
let sharedBaselineCommit;
|
|
16094
16443
|
let beforeAllOutput;
|
|
16095
|
-
|
|
16444
|
+
let poolManager;
|
|
16445
|
+
let poolSlot;
|
|
16446
|
+
const poolSlots = [];
|
|
16447
|
+
const availablePoolSlots = [];
|
|
16448
|
+
const poolSlotBaselines = /* @__PURE__ */ new Map();
|
|
16449
|
+
const poolMaxSlots = Math.min(configPoolMaxSlots ?? 10, 50);
|
|
16450
|
+
if (useStaticWorkspace && configuredStaticPath) {
|
|
16451
|
+
sharedWorkspacePath = configuredStaticPath;
|
|
16452
|
+
setupLog(`using static workspace: ${configuredStaticPath}`);
|
|
16453
|
+
} else if (usePool && suiteWorkspace?.repos) {
|
|
16454
|
+
const slotsNeeded = workers;
|
|
16455
|
+
setupLog(`acquiring ${slotsNeeded} workspace pool slot(s) (pool capacity: ${poolMaxSlots})`);
|
|
16456
|
+
poolManager = new WorkspacePoolManager(getWorkspacePoolRoot());
|
|
16457
|
+
const poolRepoManager = new RepoManager(verbose);
|
|
16458
|
+
for (let i = 0; i < slotsNeeded; i++) {
|
|
16459
|
+
const slot = await poolManager.acquireWorkspace({
|
|
16460
|
+
templatePath: workspaceTemplate,
|
|
16461
|
+
repos: suiteWorkspace.repos,
|
|
16462
|
+
maxSlots: poolMaxSlots,
|
|
16463
|
+
repoManager: poolRepoManager,
|
|
16464
|
+
poolReset: (workspaceClean === "full" ? "strict" : workspaceClean === "standard" ? "fast" : null) ?? suiteWorkspace.hooks?.on_reuse?.reset ?? "fast"
|
|
16465
|
+
});
|
|
16466
|
+
poolSlots.push(slot);
|
|
16467
|
+
setupLog(`pool slot ${i} acquired at: ${slot.path} (existing=${slot.isExisting})`);
|
|
16468
|
+
}
|
|
16469
|
+
if (slotsNeeded === 1) {
|
|
16470
|
+
poolSlot = poolSlots[0];
|
|
16471
|
+
sharedWorkspacePath = poolSlot.path;
|
|
16472
|
+
} else {
|
|
16473
|
+
availablePoolSlots.push(...poolSlots);
|
|
16474
|
+
}
|
|
16475
|
+
} else if (workspaceTemplate) {
|
|
16096
16476
|
setupLog(`creating shared workspace from template: ${workspaceTemplate}`);
|
|
16097
16477
|
try {
|
|
16098
16478
|
sharedWorkspacePath = await createTempWorkspace(workspaceTemplate, evalRunId, "shared");
|
|
@@ -16101,288 +16481,359 @@ async function runEvaluation(options) {
|
|
|
16101
16481
|
const message = error instanceof Error ? error.message : String(error);
|
|
16102
16482
|
throw new Error(`Failed to create shared workspace: ${message}`);
|
|
16103
16483
|
}
|
|
16484
|
+
} else if (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
|
|
16485
|
+
sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
|
|
16486
|
+
await (0, import_promises29.mkdir)(sharedWorkspacePath, { recursive: true });
|
|
16487
|
+
setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
|
|
16488
|
+
}
|
|
16489
|
+
try {
|
|
16104
16490
|
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
16105
|
-
const copiedWorkspaceFile =
|
|
16491
|
+
const copiedWorkspaceFile = import_node_path42.default.join(sharedWorkspacePath, import_node_path42.default.basename(suiteWorkspaceFile));
|
|
16106
16492
|
try {
|
|
16107
16493
|
await (0, import_promises29.stat)(copiedWorkspaceFile);
|
|
16108
16494
|
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
16109
16495
|
} catch {
|
|
16110
16496
|
}
|
|
16111
16497
|
}
|
|
16112
|
-
|
|
16113
|
-
sharedWorkspacePath
|
|
16114
|
-
|
|
16115
|
-
|
|
16116
|
-
|
|
16117
|
-
|
|
16118
|
-
|
|
16119
|
-
|
|
16120
|
-
|
|
16121
|
-
|
|
16122
|
-
|
|
16123
|
-
|
|
16124
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
16125
|
-
if (sharedWorkspacePath) {
|
|
16126
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16127
|
-
});
|
|
16128
|
-
}
|
|
16129
|
-
throw new Error(`Failed to materialize repos: ${message}`);
|
|
16130
|
-
}
|
|
16131
|
-
}
|
|
16132
|
-
if (sharedWorkspacePath && suiteWorkspace?.before_all) {
|
|
16133
|
-
const beforeAllCommand = (suiteWorkspace.before_all.command ?? suiteWorkspace.before_all.script ?? []).join(" ");
|
|
16134
|
-
setupLog(
|
|
16135
|
-
`running shared before_all in cwd=${suiteWorkspace.before_all.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
16136
|
-
);
|
|
16137
|
-
const scriptContext = {
|
|
16138
|
-
workspacePath: sharedWorkspacePath,
|
|
16139
|
-
testId: "__before_all__",
|
|
16140
|
-
evalRunId,
|
|
16141
|
-
evalDir
|
|
16142
|
-
};
|
|
16143
|
-
try {
|
|
16144
|
-
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
16145
|
-
setupLog("shared before_all completed");
|
|
16146
|
-
} catch (error) {
|
|
16147
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
16148
|
-
if (sharedWorkspacePath) {
|
|
16149
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16150
|
-
});
|
|
16151
|
-
}
|
|
16152
|
-
throw new Error(`before_all script failed: ${message}`);
|
|
16153
|
-
}
|
|
16154
|
-
}
|
|
16155
|
-
if (sharedWorkspacePath) {
|
|
16156
|
-
try {
|
|
16157
|
-
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
16158
|
-
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
16159
|
-
} catch {
|
|
16160
|
-
setupLog("shared baseline initialization skipped (non-fatal)");
|
|
16161
|
-
}
|
|
16162
|
-
}
|
|
16163
|
-
let nextWorkerId = 1;
|
|
16164
|
-
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
16165
|
-
let beforeAllOutputAttached = false;
|
|
16166
|
-
let cumulativeBudgetCost = 0;
|
|
16167
|
-
let budgetExhausted = false;
|
|
16168
|
-
let failOnErrorTriggered = false;
|
|
16169
|
-
const promises = filteredEvalCases.map(
|
|
16170
|
-
(evalCase) => limit(async () => {
|
|
16171
|
-
const workerId = nextWorkerId++;
|
|
16172
|
-
workerIdByEvalId.set(evalCase.id, workerId);
|
|
16173
|
-
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
16174
|
-
const budgetResult = {
|
|
16175
|
-
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16176
|
-
testId: evalCase.id,
|
|
16177
|
-
dataset: evalCase.dataset,
|
|
16178
|
-
score: 0,
|
|
16179
|
-
hits: [],
|
|
16180
|
-
misses: [],
|
|
16181
|
-
answer: "",
|
|
16182
|
-
target: target.name,
|
|
16183
|
-
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16184
|
-
budgetExceeded: true,
|
|
16185
|
-
executionStatus: "execution_error",
|
|
16186
|
-
failureStage: "setup",
|
|
16187
|
-
failureReasonCode: "budget_exceeded",
|
|
16188
|
-
executionError: {
|
|
16189
|
-
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16190
|
-
stage: "setup"
|
|
16191
|
-
}
|
|
16192
|
-
};
|
|
16193
|
-
if (onProgress) {
|
|
16194
|
-
await onProgress({
|
|
16195
|
-
workerId,
|
|
16196
|
-
testId: evalCase.id,
|
|
16197
|
-
status: "failed",
|
|
16198
|
-
completedAt: Date.now(),
|
|
16199
|
-
error: budgetResult.error
|
|
16498
|
+
const repoManager = suiteWorkspace?.repos?.length && !usePool && !useStaticWorkspace ? new RepoManager(verbose) : void 0;
|
|
16499
|
+
if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
|
|
16500
|
+
setupLog(
|
|
16501
|
+
`materializing ${suiteWorkspace.repos.length} shared repo(s) into ${sharedWorkspacePath}`
|
|
16502
|
+
);
|
|
16503
|
+
try {
|
|
16504
|
+
await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
|
|
16505
|
+
setupLog("shared repo materialization complete");
|
|
16506
|
+
} catch (error) {
|
|
16507
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16508
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
16509
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16200
16510
|
});
|
|
16201
16511
|
}
|
|
16202
|
-
|
|
16203
|
-
await onResult(budgetResult);
|
|
16204
|
-
}
|
|
16205
|
-
return budgetResult;
|
|
16512
|
+
throw new Error(`Failed to materialize repos: ${message}`);
|
|
16206
16513
|
}
|
|
16207
|
-
|
|
16208
|
-
|
|
16209
|
-
|
|
16210
|
-
|
|
16211
|
-
|
|
16212
|
-
|
|
16213
|
-
|
|
16214
|
-
|
|
16215
|
-
|
|
16216
|
-
|
|
16217
|
-
|
|
16218
|
-
|
|
16219
|
-
|
|
16220
|
-
|
|
16221
|
-
|
|
16222
|
-
|
|
16223
|
-
|
|
16224
|
-
|
|
16225
|
-
|
|
16226
|
-
|
|
16227
|
-
|
|
16228
|
-
|
|
16229
|
-
|
|
16230
|
-
|
|
16514
|
+
}
|
|
16515
|
+
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all_tests;
|
|
16516
|
+
if (sharedWorkspacePath && hasHookCommand(suiteBeforeAllHook)) {
|
|
16517
|
+
const beforeAllHook = suiteBeforeAllHook;
|
|
16518
|
+
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
16519
|
+
setupLog(
|
|
16520
|
+
`running shared before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
16521
|
+
);
|
|
16522
|
+
const scriptContext = {
|
|
16523
|
+
workspacePath: sharedWorkspacePath,
|
|
16524
|
+
testId: "__before_all__",
|
|
16525
|
+
evalRunId,
|
|
16526
|
+
evalDir
|
|
16527
|
+
};
|
|
16528
|
+
try {
|
|
16529
|
+
beforeAllOutput = await executeWorkspaceScript(
|
|
16530
|
+
toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
|
|
16531
|
+
scriptContext
|
|
16532
|
+
);
|
|
16533
|
+
setupLog("shared before_all completed");
|
|
16534
|
+
} catch (error) {
|
|
16535
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16536
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
16537
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16231
16538
|
});
|
|
16232
16539
|
}
|
|
16233
|
-
|
|
16234
|
-
await onResult(haltResult);
|
|
16235
|
-
}
|
|
16236
|
-
return haltResult;
|
|
16237
|
-
}
|
|
16238
|
-
if (onProgress) {
|
|
16239
|
-
await onProgress({
|
|
16240
|
-
workerId,
|
|
16241
|
-
testId: evalCase.id,
|
|
16242
|
-
status: "running",
|
|
16243
|
-
startedAt: Date.now()
|
|
16244
|
-
});
|
|
16540
|
+
throw new Error(`before_all script failed: ${message}`);
|
|
16245
16541
|
}
|
|
16246
|
-
|
|
16247
|
-
|
|
16248
|
-
|
|
16249
|
-
|
|
16250
|
-
|
|
16251
|
-
|
|
16252
|
-
|
|
16253
|
-
|
|
16254
|
-
agentTimeoutMs,
|
|
16255
|
-
cache,
|
|
16256
|
-
useCache,
|
|
16257
|
-
now,
|
|
16258
|
-
judgeProvider,
|
|
16259
|
-
targetResolver,
|
|
16260
|
-
availableTargets,
|
|
16542
|
+
}
|
|
16543
|
+
if (availablePoolSlots.length > 0 && hasHookCommand(suiteBeforeAllHook)) {
|
|
16544
|
+
const beforeAllHook = suiteBeforeAllHook;
|
|
16545
|
+
for (const slot of availablePoolSlots) {
|
|
16546
|
+
setupLog(`running before_all on pool slot ${slot.index}`);
|
|
16547
|
+
const scriptContext = {
|
|
16548
|
+
workspacePath: slot.path,
|
|
16549
|
+
testId: "__before_all__",
|
|
16261
16550
|
evalRunId,
|
|
16262
|
-
keepWorkspaces,
|
|
16263
|
-
cleanupWorkspaces,
|
|
16264
|
-
sharedWorkspacePath,
|
|
16265
|
-
sharedBaselineCommit,
|
|
16266
|
-
suiteWorkspaceFile,
|
|
16267
|
-
streamCallbacks,
|
|
16268
|
-
typeRegistry,
|
|
16269
|
-
repoManager,
|
|
16270
16551
|
evalDir
|
|
16271
16552
|
};
|
|
16272
|
-
|
|
16273
|
-
|
|
16274
|
-
|
|
16275
|
-
|
|
16276
|
-
|
|
16277
|
-
|
|
16278
|
-
|
|
16553
|
+
try {
|
|
16554
|
+
const output = await executeWorkspaceScript(
|
|
16555
|
+
toScriptConfig(beforeAllHook, "before_all_tests", "suite workspace"),
|
|
16556
|
+
scriptContext
|
|
16557
|
+
);
|
|
16558
|
+
if (!beforeAllOutput) beforeAllOutput = output;
|
|
16559
|
+
setupLog(`before_all completed on pool slot ${slot.index}`);
|
|
16560
|
+
} catch (error) {
|
|
16561
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16562
|
+
throw new Error(`before_all script failed on pool slot ${slot.index}: ${message}`);
|
|
16563
|
+
}
|
|
16564
|
+
}
|
|
16565
|
+
}
|
|
16566
|
+
if (sharedWorkspacePath) {
|
|
16567
|
+
try {
|
|
16568
|
+
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
16569
|
+
setupLog(`shared baseline initialized: ${sharedBaselineCommit}`);
|
|
16570
|
+
} catch {
|
|
16571
|
+
setupLog("shared baseline initialization skipped (non-fatal)");
|
|
16572
|
+
}
|
|
16573
|
+
}
|
|
16574
|
+
if (availablePoolSlots.length > 0) {
|
|
16575
|
+
for (const slot of availablePoolSlots) {
|
|
16576
|
+
try {
|
|
16577
|
+
const baseline = await initializeBaseline(slot.path);
|
|
16578
|
+
poolSlotBaselines.set(slot.path, baseline);
|
|
16579
|
+
setupLog(`pool slot ${slot.index} baseline initialized: ${baseline}`);
|
|
16580
|
+
} catch {
|
|
16581
|
+
setupLog(`pool slot ${slot.index} baseline initialization skipped (non-fatal)`);
|
|
16582
|
+
}
|
|
16583
|
+
}
|
|
16584
|
+
}
|
|
16585
|
+
let nextWorkerId = 1;
|
|
16586
|
+
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
16587
|
+
let beforeAllOutputAttached = false;
|
|
16588
|
+
let cumulativeBudgetCost = 0;
|
|
16589
|
+
let budgetExhausted = false;
|
|
16590
|
+
let failOnErrorTriggered = false;
|
|
16591
|
+
const promises = filteredEvalCases.map(
|
|
16592
|
+
(evalCase) => limit(async () => {
|
|
16593
|
+
const workerId = nextWorkerId++;
|
|
16594
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
16595
|
+
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
16596
|
+
const budgetResult = {
|
|
16597
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16598
|
+
testId: evalCase.id,
|
|
16599
|
+
dataset: evalCase.dataset,
|
|
16600
|
+
score: 0,
|
|
16601
|
+
hits: [],
|
|
16602
|
+
misses: [],
|
|
16603
|
+
answer: "",
|
|
16604
|
+
target: target.name,
|
|
16605
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16606
|
+
budgetExceeded: true,
|
|
16607
|
+
executionStatus: "execution_error",
|
|
16608
|
+
failureStage: "setup",
|
|
16609
|
+
failureReasonCode: "budget_exceeded",
|
|
16610
|
+
executionError: {
|
|
16611
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
16612
|
+
stage: "setup"
|
|
16279
16613
|
}
|
|
16280
|
-
}
|
|
16281
|
-
|
|
16614
|
+
};
|
|
16615
|
+
if (onProgress) {
|
|
16616
|
+
await onProgress({
|
|
16617
|
+
workerId,
|
|
16618
|
+
testId: evalCase.id,
|
|
16619
|
+
status: "failed",
|
|
16620
|
+
completedAt: Date.now(),
|
|
16621
|
+
error: budgetResult.error
|
|
16622
|
+
});
|
|
16282
16623
|
}
|
|
16283
|
-
if (
|
|
16284
|
-
|
|
16285
|
-
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
16286
|
-
budgetExhausted = true;
|
|
16287
|
-
}
|
|
16624
|
+
if (onResult) {
|
|
16625
|
+
await onResult(budgetResult);
|
|
16288
16626
|
}
|
|
16627
|
+
return budgetResult;
|
|
16289
16628
|
}
|
|
16290
|
-
if (failOnError === true &&
|
|
16291
|
-
|
|
16292
|
-
|
|
16293
|
-
|
|
16294
|
-
|
|
16295
|
-
|
|
16629
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
16630
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
16631
|
+
const haltResult = {
|
|
16632
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16633
|
+
testId: evalCase.id,
|
|
16634
|
+
dataset: evalCase.dataset,
|
|
16635
|
+
score: 0,
|
|
16636
|
+
hits: [],
|
|
16637
|
+
misses: [],
|
|
16638
|
+
answer: "",
|
|
16639
|
+
target: target.name,
|
|
16640
|
+
error: errorMsg,
|
|
16641
|
+
executionStatus: "execution_error",
|
|
16642
|
+
failureStage: "setup",
|
|
16643
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
16644
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
16645
|
+
};
|
|
16646
|
+
if (onProgress) {
|
|
16647
|
+
await onProgress({
|
|
16648
|
+
workerId,
|
|
16649
|
+
testId: evalCase.id,
|
|
16650
|
+
status: "failed",
|
|
16651
|
+
completedAt: Date.now(),
|
|
16652
|
+
error: haltResult.error
|
|
16653
|
+
});
|
|
16654
|
+
}
|
|
16655
|
+
if (onResult) {
|
|
16656
|
+
await onResult(haltResult);
|
|
16657
|
+
}
|
|
16658
|
+
return haltResult;
|
|
16296
16659
|
}
|
|
16297
16660
|
if (onProgress) {
|
|
16298
16661
|
await onProgress({
|
|
16299
16662
|
workerId,
|
|
16300
16663
|
testId: evalCase.id,
|
|
16301
|
-
status:
|
|
16302
|
-
startedAt:
|
|
16303
|
-
// Not used for completed status
|
|
16304
|
-
completedAt: Date.now(),
|
|
16305
|
-
error: result.error
|
|
16664
|
+
status: "running",
|
|
16665
|
+
startedAt: Date.now()
|
|
16306
16666
|
});
|
|
16307
16667
|
}
|
|
16308
|
-
|
|
16309
|
-
|
|
16668
|
+
const testPoolSlot = availablePoolSlots.length > 0 ? availablePoolSlots.pop() : void 0;
|
|
16669
|
+
const testWorkspacePath = testPoolSlot?.path ?? sharedWorkspacePath;
|
|
16670
|
+
const testBaselineCommit = testPoolSlot ? poolSlotBaselines.get(testPoolSlot.path) : sharedBaselineCommit;
|
|
16671
|
+
try {
|
|
16672
|
+
const judgeProvider = await resolveJudgeProvider(target);
|
|
16673
|
+
const runCaseOptions = {
|
|
16674
|
+
evalCase,
|
|
16675
|
+
provider: primaryProvider,
|
|
16676
|
+
target,
|
|
16677
|
+
evaluators: evaluatorRegistry,
|
|
16678
|
+
maxRetries,
|
|
16679
|
+
agentTimeoutMs,
|
|
16680
|
+
cache,
|
|
16681
|
+
useCache,
|
|
16682
|
+
now,
|
|
16683
|
+
judgeProvider,
|
|
16684
|
+
targetResolver,
|
|
16685
|
+
availableTargets,
|
|
16686
|
+
evalRunId,
|
|
16687
|
+
keepWorkspaces,
|
|
16688
|
+
cleanupWorkspaces,
|
|
16689
|
+
retainOnSuccess: resolvedRetainOnSuccess,
|
|
16690
|
+
retainOnFailure: resolvedRetainOnFailure,
|
|
16691
|
+
sharedWorkspacePath: testWorkspacePath,
|
|
16692
|
+
sharedBaselineCommit: testBaselineCommit,
|
|
16693
|
+
suiteWorkspaceFile,
|
|
16694
|
+
streamCallbacks,
|
|
16695
|
+
typeRegistry,
|
|
16696
|
+
repoManager,
|
|
16697
|
+
evalDir
|
|
16698
|
+
};
|
|
16699
|
+
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
16700
|
+
if (totalBudgetUsd !== void 0) {
|
|
16701
|
+
let caseCost;
|
|
16702
|
+
if (result.trials && result.trials.length > 0) {
|
|
16703
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
16704
|
+
if (trialCostSum > 0) {
|
|
16705
|
+
caseCost = trialCostSum;
|
|
16706
|
+
}
|
|
16707
|
+
} else {
|
|
16708
|
+
caseCost = result.costUsd;
|
|
16709
|
+
}
|
|
16710
|
+
if (caseCost !== void 0) {
|
|
16711
|
+
cumulativeBudgetCost += caseCost;
|
|
16712
|
+
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
16713
|
+
budgetExhausted = true;
|
|
16714
|
+
}
|
|
16715
|
+
}
|
|
16716
|
+
}
|
|
16717
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
16718
|
+
failOnErrorTriggered = true;
|
|
16719
|
+
}
|
|
16720
|
+
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
16721
|
+
result = { ...result, beforeAllOutput };
|
|
16722
|
+
beforeAllOutputAttached = true;
|
|
16723
|
+
}
|
|
16724
|
+
if (onProgress) {
|
|
16725
|
+
await onProgress({
|
|
16726
|
+
workerId,
|
|
16727
|
+
testId: evalCase.id,
|
|
16728
|
+
status: result.error ? "failed" : "completed",
|
|
16729
|
+
startedAt: 0,
|
|
16730
|
+
// Not used for completed status
|
|
16731
|
+
completedAt: Date.now(),
|
|
16732
|
+
error: result.error
|
|
16733
|
+
});
|
|
16734
|
+
}
|
|
16735
|
+
if (onResult) {
|
|
16736
|
+
await onResult(result);
|
|
16737
|
+
}
|
|
16738
|
+
return result;
|
|
16739
|
+
} catch (error) {
|
|
16740
|
+
if (onProgress) {
|
|
16741
|
+
await onProgress({
|
|
16742
|
+
workerId,
|
|
16743
|
+
testId: evalCase.id,
|
|
16744
|
+
status: "failed",
|
|
16745
|
+
completedAt: Date.now(),
|
|
16746
|
+
error: error instanceof Error ? error.message : String(error)
|
|
16747
|
+
});
|
|
16748
|
+
}
|
|
16749
|
+
throw error;
|
|
16750
|
+
} finally {
|
|
16751
|
+
if (testPoolSlot) {
|
|
16752
|
+
availablePoolSlots.push(testPoolSlot);
|
|
16753
|
+
}
|
|
16310
16754
|
}
|
|
16311
|
-
|
|
16312
|
-
|
|
16313
|
-
|
|
16314
|
-
|
|
16315
|
-
|
|
16316
|
-
|
|
16317
|
-
|
|
16318
|
-
|
|
16319
|
-
|
|
16320
|
-
|
|
16755
|
+
})
|
|
16756
|
+
);
|
|
16757
|
+
const settled = await Promise.allSettled(promises);
|
|
16758
|
+
const results = [];
|
|
16759
|
+
for (let i = 0; i < settled.length; i++) {
|
|
16760
|
+
const outcome = settled[i];
|
|
16761
|
+
if (outcome.status === "fulfilled") {
|
|
16762
|
+
results.push(outcome.value);
|
|
16763
|
+
} else {
|
|
16764
|
+
const evalCase = filteredEvalCases[i];
|
|
16765
|
+
const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
|
|
16766
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
16767
|
+
const errorResult = buildErrorResult(
|
|
16768
|
+
evalCase,
|
|
16769
|
+
target.name,
|
|
16770
|
+
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
16771
|
+
outcome.reason,
|
|
16772
|
+
promptInputs,
|
|
16773
|
+
primaryProvider,
|
|
16774
|
+
"agent",
|
|
16775
|
+
"provider_error"
|
|
16776
|
+
);
|
|
16777
|
+
results.push(errorResult);
|
|
16778
|
+
if (onResult) {
|
|
16779
|
+
await onResult(errorResult);
|
|
16321
16780
|
}
|
|
16322
|
-
throw error;
|
|
16323
16781
|
}
|
|
16324
|
-
}
|
|
16325
|
-
|
|
16326
|
-
|
|
16327
|
-
|
|
16328
|
-
|
|
16329
|
-
|
|
16330
|
-
|
|
16331
|
-
|
|
16332
|
-
|
|
16333
|
-
|
|
16334
|
-
|
|
16335
|
-
|
|
16336
|
-
|
|
16337
|
-
|
|
16338
|
-
|
|
16339
|
-
|
|
16340
|
-
|
|
16341
|
-
|
|
16342
|
-
|
|
16343
|
-
|
|
16344
|
-
|
|
16345
|
-
|
|
16346
|
-
|
|
16347
|
-
if (onResult) {
|
|
16348
|
-
await onResult(errorResult);
|
|
16782
|
+
}
|
|
16783
|
+
const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
|
|
16784
|
+
const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all_tests;
|
|
16785
|
+
if (afterAllWorkspaces.length > 0 && hasHookCommand(suiteAfterAllHook)) {
|
|
16786
|
+
const afterAllHook = suiteAfterAllHook;
|
|
16787
|
+
for (const wsPath of afterAllWorkspaces) {
|
|
16788
|
+
const scriptContext = {
|
|
16789
|
+
workspacePath: wsPath,
|
|
16790
|
+
testId: "__after_all__",
|
|
16791
|
+
evalRunId,
|
|
16792
|
+
evalDir
|
|
16793
|
+
};
|
|
16794
|
+
try {
|
|
16795
|
+
const afterAllOutput = await executeWorkspaceScript(
|
|
16796
|
+
toScriptConfig(afterAllHook, "after_all_tests", "suite workspace"),
|
|
16797
|
+
scriptContext,
|
|
16798
|
+
"warn"
|
|
16799
|
+
);
|
|
16800
|
+
if (afterAllOutput && results.length > 0 && wsPath === afterAllWorkspaces[0]) {
|
|
16801
|
+
results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
|
|
16802
|
+
}
|
|
16803
|
+
} catch {
|
|
16804
|
+
}
|
|
16349
16805
|
}
|
|
16350
16806
|
}
|
|
16351
|
-
|
|
16352
|
-
|
|
16353
|
-
|
|
16354
|
-
|
|
16355
|
-
|
|
16356
|
-
|
|
16357
|
-
|
|
16358
|
-
|
|
16359
|
-
|
|
16360
|
-
|
|
16361
|
-
suiteWorkspace.after_all,
|
|
16362
|
-
scriptContext,
|
|
16363
|
-
"warn"
|
|
16364
|
-
);
|
|
16365
|
-
if (afterAllOutput && results.length > 0) {
|
|
16366
|
-
results[results.length - 1] = { ...results[results.length - 1], afterAllOutput };
|
|
16807
|
+
if (sharedWorkspacePath && !poolSlot && poolSlots.length === 0 && !useStaticWorkspace) {
|
|
16808
|
+
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
16809
|
+
if (hasFailure) {
|
|
16810
|
+
if (resolvedRetainOnFailure === "cleanup") {
|
|
16811
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16812
|
+
});
|
|
16813
|
+
}
|
|
16814
|
+
} else if (resolvedRetainOnSuccess === "cleanup") {
|
|
16815
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16816
|
+
});
|
|
16367
16817
|
}
|
|
16368
|
-
} catch {
|
|
16369
16818
|
}
|
|
16370
|
-
}
|
|
16371
|
-
if (sharedWorkspacePath) {
|
|
16372
|
-
const hasFailure = results.some((r) => !!r.error || r.score < 0.5);
|
|
16373
16819
|
if (cleanupWorkspaces) {
|
|
16374
|
-
await
|
|
16375
|
-
});
|
|
16376
|
-
} else if (!hasFailure && !keepWorkspaces) {
|
|
16377
|
-
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16820
|
+
await cleanupEvalWorkspaces(evalRunId).catch(() => {
|
|
16378
16821
|
});
|
|
16379
16822
|
}
|
|
16823
|
+
return results;
|
|
16824
|
+
} finally {
|
|
16825
|
+
if (poolManager) {
|
|
16826
|
+
if (poolSlot) {
|
|
16827
|
+
await poolManager.releaseSlot(poolSlot);
|
|
16828
|
+
}
|
|
16829
|
+
for (const slot of poolSlots) {
|
|
16830
|
+
if (slot !== poolSlot) {
|
|
16831
|
+
await poolManager.releaseSlot(slot).catch(() => {
|
|
16832
|
+
});
|
|
16833
|
+
}
|
|
16834
|
+
}
|
|
16835
|
+
}
|
|
16380
16836
|
}
|
|
16381
|
-
if (cleanupWorkspaces) {
|
|
16382
|
-
await cleanupEvalWorkspaces(evalRunId).catch(() => {
|
|
16383
|
-
});
|
|
16384
|
-
}
|
|
16385
|
-
return results;
|
|
16386
16837
|
}
|
|
16387
16838
|
async function runBatchEvaluation(options) {
|
|
16388
16839
|
const {
|
|
@@ -16554,6 +17005,8 @@ async function runEvalCase(options) {
|
|
|
16554
17005
|
evalRunId,
|
|
16555
17006
|
keepWorkspaces,
|
|
16556
17007
|
cleanupWorkspaces: forceCleanup,
|
|
17008
|
+
retainOnSuccess,
|
|
17009
|
+
retainOnFailure,
|
|
16557
17010
|
sharedWorkspacePath,
|
|
16558
17011
|
sharedBaselineCommit,
|
|
16559
17012
|
suiteWorkspaceFile,
|
|
@@ -16565,10 +17018,10 @@ async function runEvalCase(options) {
|
|
|
16565
17018
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
16566
17019
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
16567
17020
|
const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
|
|
16568
|
-
const
|
|
17021
|
+
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
16569
17022
|
let cachedResponse;
|
|
16570
|
-
if (
|
|
16571
|
-
cachedResponse = await cache.get(
|
|
17023
|
+
if (cacheKey && cache) {
|
|
17024
|
+
cachedResponse = await cache.get(cacheKey);
|
|
16572
17025
|
}
|
|
16573
17026
|
const nowFn = now ?? (() => /* @__PURE__ */ new Date());
|
|
16574
17027
|
let workspacePath = sharedWorkspacePath;
|
|
@@ -16599,7 +17052,7 @@ async function runEvalCase(options) {
|
|
|
16599
17052
|
);
|
|
16600
17053
|
}
|
|
16601
17054
|
if (caseWorkspaceFile && workspacePath) {
|
|
16602
|
-
const copiedFile =
|
|
17055
|
+
const copiedFile = import_node_path42.default.join(workspacePath, import_node_path42.default.basename(caseWorkspaceFile));
|
|
16603
17056
|
try {
|
|
16604
17057
|
await (0, import_promises29.stat)(copiedFile);
|
|
16605
17058
|
caseWorkspaceFile = copiedFile;
|
|
@@ -16607,12 +17060,12 @@ async function runEvalCase(options) {
|
|
|
16607
17060
|
}
|
|
16608
17061
|
}
|
|
16609
17062
|
}
|
|
16610
|
-
if (!workspacePath && (evalCase.workspace?.
|
|
17063
|
+
if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
|
|
16611
17064
|
workspacePath = getWorkspacePath(evalRunId, evalCase.id);
|
|
16612
17065
|
await (0, import_promises29.mkdir)(workspacePath, { recursive: true });
|
|
16613
17066
|
}
|
|
16614
17067
|
if (evalCase.workspace?.repos?.length && workspacePath) {
|
|
16615
|
-
const perCaseRepoManager = new RepoManager(
|
|
17068
|
+
const perCaseRepoManager = new RepoManager(setupDebug);
|
|
16616
17069
|
try {
|
|
16617
17070
|
if (setupDebug) {
|
|
16618
17071
|
console.log(
|
|
@@ -16637,11 +17090,13 @@ async function runEvalCase(options) {
|
|
|
16637
17090
|
);
|
|
16638
17091
|
}
|
|
16639
17092
|
}
|
|
16640
|
-
|
|
16641
|
-
|
|
17093
|
+
const caseBeforeAllHook = evalCase.workspace?.hooks?.before_all_tests;
|
|
17094
|
+
if (workspacePath && hasHookCommand(caseBeforeAllHook)) {
|
|
17095
|
+
const beforeAllHook = caseBeforeAllHook;
|
|
17096
|
+
const beforeAllCommand = (beforeAllHook.command ?? beforeAllHook.script ?? []).join(" ");
|
|
16642
17097
|
if (setupDebug) {
|
|
16643
17098
|
console.log(
|
|
16644
|
-
`[setup] test=${evalCase.id} running before_all in cwd=${
|
|
17099
|
+
`[setup] test=${evalCase.id} running before_all in cwd=${beforeAllHook.cwd ?? evalDir} command=${beforeAllCommand}`
|
|
16645
17100
|
);
|
|
16646
17101
|
}
|
|
16647
17102
|
const scriptContext = {
|
|
@@ -16654,7 +17109,7 @@ async function runEvalCase(options) {
|
|
|
16654
17109
|
};
|
|
16655
17110
|
try {
|
|
16656
17111
|
beforeAllOutput = await executeWorkspaceScript(
|
|
16657
|
-
evalCase.
|
|
17112
|
+
toScriptConfig(beforeAllHook, "before_all_tests", `test '${evalCase.id}'`),
|
|
16658
17113
|
scriptContext
|
|
16659
17114
|
);
|
|
16660
17115
|
if (setupDebug) {
|
|
@@ -16679,7 +17134,9 @@ async function runEvalCase(options) {
|
|
|
16679
17134
|
}
|
|
16680
17135
|
}
|
|
16681
17136
|
}
|
|
16682
|
-
|
|
17137
|
+
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each_test;
|
|
17138
|
+
if (workspacePath && hasHookCommand(caseBeforeEachHook)) {
|
|
17139
|
+
const beforeEachHook = caseBeforeEachHook;
|
|
16683
17140
|
const scriptContext = {
|
|
16684
17141
|
workspacePath,
|
|
16685
17142
|
testId: evalCase.id,
|
|
@@ -16690,7 +17147,7 @@ async function runEvalCase(options) {
|
|
|
16690
17147
|
};
|
|
16691
17148
|
try {
|
|
16692
17149
|
beforeEachOutput = await executeWorkspaceScript(
|
|
16693
|
-
evalCase.
|
|
17150
|
+
toScriptConfig(beforeEachHook, "before_each_test", `test '${evalCase.id}'`),
|
|
16694
17151
|
scriptContext
|
|
16695
17152
|
);
|
|
16696
17153
|
} catch (error) {
|
|
@@ -16778,8 +17235,8 @@ async function runEvalCase(options) {
|
|
|
16778
17235
|
}
|
|
16779
17236
|
return errorResult;
|
|
16780
17237
|
}
|
|
16781
|
-
if (
|
|
16782
|
-
await cache.set(
|
|
17238
|
+
if (cacheKey && cache && !cachedResponse) {
|
|
17239
|
+
await cache.set(cacheKey, providerResponse);
|
|
16783
17240
|
}
|
|
16784
17241
|
const output = providerResponse.output;
|
|
16785
17242
|
const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
|
|
@@ -16807,17 +17264,19 @@ async function runEvalCase(options) {
|
|
|
16807
17264
|
}
|
|
16808
17265
|
}
|
|
16809
17266
|
const providerError = extractProviderError(providerResponse);
|
|
16810
|
-
if (repoManager && workspacePath && evalCase.workspace?.reset
|
|
17267
|
+
if (repoManager && workspacePath && evalCase.workspace?.hooks?.after_each_test?.reset && evalCase.workspace.hooks.after_each_test.reset !== "none" && evalCase.workspace.repos) {
|
|
16811
17268
|
try {
|
|
16812
17269
|
await repoManager.reset(
|
|
16813
17270
|
evalCase.workspace.repos,
|
|
16814
17271
|
workspacePath,
|
|
16815
|
-
evalCase.workspace.reset
|
|
17272
|
+
evalCase.workspace.hooks.after_each_test.reset
|
|
16816
17273
|
);
|
|
16817
17274
|
} catch {
|
|
16818
17275
|
}
|
|
16819
17276
|
}
|
|
16820
|
-
|
|
17277
|
+
const caseAfterEachHook = evalCase.workspace?.hooks?.after_each_test;
|
|
17278
|
+
if (workspacePath && hasHookCommand(caseAfterEachHook)) {
|
|
17279
|
+
const afterEachHook = caseAfterEachHook;
|
|
16821
17280
|
const scriptContext = {
|
|
16822
17281
|
workspacePath,
|
|
16823
17282
|
testId: evalCase.id,
|
|
@@ -16828,7 +17287,7 @@ async function runEvalCase(options) {
|
|
|
16828
17287
|
};
|
|
16829
17288
|
try {
|
|
16830
17289
|
afterEachOutput = await executeWorkspaceScript(
|
|
16831
|
-
evalCase.
|
|
17290
|
+
toScriptConfig(afterEachHook, "after_each_test", `test '${evalCase.id}'`),
|
|
16832
17291
|
scriptContext,
|
|
16833
17292
|
"warn"
|
|
16834
17293
|
);
|
|
@@ -16878,8 +17337,13 @@ async function runEvalCase(options) {
|
|
|
16878
17337
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
16879
17338
|
});
|
|
16880
17339
|
} else if (isFailure) {
|
|
16881
|
-
|
|
16882
|
-
|
|
17340
|
+
if ((retainOnFailure ?? "keep") === "cleanup") {
|
|
17341
|
+
await cleanupWorkspace(workspacePath).catch(() => {
|
|
17342
|
+
});
|
|
17343
|
+
} else {
|
|
17344
|
+
return { ...finalResult, workspacePath };
|
|
17345
|
+
}
|
|
17346
|
+
} else if ((retainOnSuccess ?? (keepWorkspaces ? "keep" : "cleanup")) !== "keep") {
|
|
16883
17347
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
16884
17348
|
});
|
|
16885
17349
|
}
|
|
@@ -16897,11 +17361,12 @@ async function runEvalCase(options) {
|
|
|
16897
17361
|
"evaluator_error"
|
|
16898
17362
|
);
|
|
16899
17363
|
if (workspacePath && !isSharedWorkspace) {
|
|
16900
|
-
if (forceCleanup) {
|
|
17364
|
+
if (forceCleanup || (retainOnFailure ?? "keep") === "cleanup") {
|
|
16901
17365
|
await cleanupWorkspace(workspacePath).catch(() => {
|
|
16902
17366
|
});
|
|
17367
|
+
} else {
|
|
17368
|
+
return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
|
|
16903
17369
|
}
|
|
16904
|
-
return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
|
|
16905
17370
|
}
|
|
16906
17371
|
return { ...errorResult, beforeEachOutput, afterEachOutput };
|
|
16907
17372
|
}
|
|
@@ -16920,7 +17385,9 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
16920
17385
|
useCache: false,
|
|
16921
17386
|
// Force cleanup for intermediate trials
|
|
16922
17387
|
cleanupWorkspaces: isLastDeclaredTrial ? options.cleanupWorkspaces : true,
|
|
16923
|
-
keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false
|
|
17388
|
+
keepWorkspaces: isLastDeclaredTrial ? options.keepWorkspaces : false,
|
|
17389
|
+
retainOnSuccess: isLastDeclaredTrial ? options.retainOnSuccess : "cleanup",
|
|
17390
|
+
retainOnFailure: isLastDeclaredTrial ? options.retainOnFailure : "cleanup"
|
|
16924
17391
|
};
|
|
16925
17392
|
const result = await runEvalCase(trialOptions);
|
|
16926
17393
|
allResults.push(result);
|
|
@@ -17209,7 +17676,7 @@ async function runEvaluatorList(options) {
|
|
|
17209
17676
|
fileChanges,
|
|
17210
17677
|
workspacePath
|
|
17211
17678
|
};
|
|
17212
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
17679
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path42.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
17213
17680
|
const dispatchContext = {
|
|
17214
17681
|
judgeProvider,
|
|
17215
17682
|
targetResolver,
|
|
@@ -17512,7 +17979,7 @@ function computeWeightedMean(entries) {
|
|
|
17512
17979
|
|
|
17513
17980
|
// src/evaluation/evaluate.ts
|
|
17514
17981
|
var import_node_fs12 = require("fs");
|
|
17515
|
-
var
|
|
17982
|
+
var import_node_path43 = __toESM(require("path"), 1);
|
|
17516
17983
|
async function evaluate(config) {
|
|
17517
17984
|
const startTime = Date.now();
|
|
17518
17985
|
if (config.tests && config.specFile) {
|
|
@@ -17534,13 +18001,13 @@ async function evaluate(config) {
|
|
|
17534
18001
|
let evalCases;
|
|
17535
18002
|
let testFilePath;
|
|
17536
18003
|
if (config.specFile) {
|
|
17537
|
-
testFilePath =
|
|
18004
|
+
testFilePath = import_node_path43.default.resolve(config.specFile);
|
|
17538
18005
|
evalCases = await loadTests(testFilePath, repoRoot, {
|
|
17539
18006
|
verbose: config.verbose,
|
|
17540
18007
|
filter: config.filter
|
|
17541
18008
|
});
|
|
17542
18009
|
} else {
|
|
17543
|
-
testFilePath =
|
|
18010
|
+
testFilePath = import_node_path43.default.join(process.cwd(), "__programmatic__.yaml");
|
|
17544
18011
|
evalCases = (config.tests ?? []).map((test) => {
|
|
17545
18012
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
17546
18013
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
@@ -17626,10 +18093,10 @@ function computeSummary(results, durationMs) {
|
|
|
17626
18093
|
var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
|
|
17627
18094
|
async function discoverDefaultTarget(repoRoot) {
|
|
17628
18095
|
const cwd = process.cwd();
|
|
17629
|
-
const chain = buildDirectoryChain2(
|
|
18096
|
+
const chain = buildDirectoryChain2(import_node_path43.default.join(cwd, "_placeholder"), repoRoot);
|
|
17630
18097
|
for (const dir of chain) {
|
|
17631
18098
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
17632
|
-
const targetsPath =
|
|
18099
|
+
const targetsPath = import_node_path43.default.join(dir, candidate);
|
|
17633
18100
|
if (!(0, import_node_fs12.existsSync)(targetsPath)) continue;
|
|
17634
18101
|
try {
|
|
17635
18102
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
@@ -17644,10 +18111,10 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
17644
18111
|
async function loadEnvHierarchy(repoRoot) {
|
|
17645
18112
|
const { readFileSync: readFileSync2 } = await import("fs");
|
|
17646
18113
|
const cwd = process.cwd();
|
|
17647
|
-
const chain = buildDirectoryChain2(
|
|
18114
|
+
const chain = buildDirectoryChain2(import_node_path43.default.join(cwd, "_placeholder"), repoRoot);
|
|
17648
18115
|
const envFiles = [];
|
|
17649
18116
|
for (const dir of chain) {
|
|
17650
|
-
const envPath =
|
|
18117
|
+
const envPath = import_node_path43.default.join(dir, ".env");
|
|
17651
18118
|
if ((0, import_node_fs12.existsSync)(envPath)) envFiles.push(envPath);
|
|
17652
18119
|
}
|
|
17653
18120
|
for (let i = envFiles.length - 1; i >= 0; i--) {
|
|
@@ -17829,7 +18296,7 @@ function buildPrompt(criteria, question, referenceAnswer) {
|
|
|
17829
18296
|
|
|
17830
18297
|
// src/evaluation/cache/response-cache.ts
|
|
17831
18298
|
var import_promises30 = require("fs/promises");
|
|
17832
|
-
var
|
|
18299
|
+
var import_node_path44 = __toESM(require("path"), 1);
|
|
17833
18300
|
var DEFAULT_CACHE_PATH = ".agentv/cache";
|
|
17834
18301
|
var ResponseCache = class {
|
|
17835
18302
|
cachePath;
|
|
@@ -17847,13 +18314,13 @@ var ResponseCache = class {
|
|
|
17847
18314
|
}
|
|
17848
18315
|
async set(key, value) {
|
|
17849
18316
|
const filePath = this.keyToPath(key);
|
|
17850
|
-
const dir =
|
|
18317
|
+
const dir = import_node_path44.default.dirname(filePath);
|
|
17851
18318
|
await (0, import_promises30.mkdir)(dir, { recursive: true });
|
|
17852
18319
|
await (0, import_promises30.writeFile)(filePath, JSON.stringify(value, null, 2), "utf8");
|
|
17853
18320
|
}
|
|
17854
18321
|
keyToPath(key) {
|
|
17855
18322
|
const prefix = key.slice(0, 2);
|
|
17856
|
-
return
|
|
18323
|
+
return import_node_path44.default.join(this.cachePath, prefix, `${key}.json`);
|
|
17857
18324
|
}
|
|
17858
18325
|
};
|
|
17859
18326
|
function shouldEnableCache(params) {
|
|
@@ -18340,6 +18807,7 @@ function createAgentKernel() {
|
|
|
18340
18807
|
TokenUsageEvaluator,
|
|
18341
18808
|
ToolTrajectoryEvaluator,
|
|
18342
18809
|
WorkspaceCreationError,
|
|
18810
|
+
WorkspacePoolManager,
|
|
18343
18811
|
assembleLlmJudgePrompt,
|
|
18344
18812
|
avgToolDurationMs,
|
|
18345
18813
|
buildDirectoryChain,
|
|
@@ -18354,6 +18822,7 @@ function createAgentKernel() {
|
|
|
18354
18822
|
cleanupEvalWorkspaces,
|
|
18355
18823
|
cleanupWorkspace,
|
|
18356
18824
|
computeTraceSummary,
|
|
18825
|
+
computeWorkspaceFingerprint,
|
|
18357
18826
|
consumeClaudeLogEntries,
|
|
18358
18827
|
consumeCodexLogEntries,
|
|
18359
18828
|
consumeCopilotCliLogEntries,
|
|
@@ -18386,11 +18855,11 @@ function createAgentKernel() {
|
|
|
18386
18855
|
freeformEvaluationSchema,
|
|
18387
18856
|
generateRubrics,
|
|
18388
18857
|
getAgentvHome,
|
|
18389
|
-
getGitCacheRoot,
|
|
18390
18858
|
getHitCount,
|
|
18391
18859
|
getSubagentsRoot,
|
|
18392
18860
|
getTraceStateRoot,
|
|
18393
18861
|
getWorkspacePath,
|
|
18862
|
+
getWorkspacePoolRoot,
|
|
18394
18863
|
getWorkspacesRoot,
|
|
18395
18864
|
initializeBaseline,
|
|
18396
18865
|
isEvaluatorKind,
|