@agentv/core 2.7.1-next.5 → 2.8.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6W5E3VR6.js → chunk-P2465XAH.js} +24 -49
- package/dist/chunk-P2465XAH.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +28 -58
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +21 -44
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +295 -220
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +44 -42
- package/dist/index.d.ts +44 -42
- package/dist/index.js +273 -173
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-6W5E3VR6.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-6W5E3VR6.js";
|
|
20
|
+
} from "./chunk-P2465XAH.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -538,6 +538,24 @@ function extractCacheConfig(suite) {
|
|
|
538
538
|
const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
|
|
539
539
|
return { enabled: cache, cachePath: resolvedCachePath };
|
|
540
540
|
}
|
|
541
|
+
function extractTotalBudgetUsd(suite) {
|
|
542
|
+
const execution = suite.execution;
|
|
543
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
544
|
+
return void 0;
|
|
545
|
+
}
|
|
546
|
+
const executionObj = execution;
|
|
547
|
+
const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
|
|
548
|
+
if (rawBudget === void 0 || rawBudget === null) {
|
|
549
|
+
return void 0;
|
|
550
|
+
}
|
|
551
|
+
if (typeof rawBudget === "number" && rawBudget > 0) {
|
|
552
|
+
return rawBudget;
|
|
553
|
+
}
|
|
554
|
+
logWarning(
|
|
555
|
+
`Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
|
|
556
|
+
);
|
|
557
|
+
return void 0;
|
|
558
|
+
}
|
|
541
559
|
function logWarning(message) {
|
|
542
560
|
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
543
561
|
}
|
|
@@ -670,24 +688,24 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
670
688
|
continue;
|
|
671
689
|
}
|
|
672
690
|
if (typeValue === "code_judge") {
|
|
673
|
-
let
|
|
674
|
-
const
|
|
675
|
-
if (typeof
|
|
676
|
-
const trimmed =
|
|
691
|
+
let command;
|
|
692
|
+
const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
|
|
693
|
+
if (typeof rawCommand === "string") {
|
|
694
|
+
const trimmed = rawCommand.trim();
|
|
677
695
|
if (trimmed.length === 0) {
|
|
678
696
|
throw new Error(
|
|
679
|
-
`Invalid code_judge
|
|
697
|
+
`Invalid code_judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
|
|
680
698
|
);
|
|
681
699
|
}
|
|
682
|
-
|
|
700
|
+
command = parseCommandToArgv(trimmed);
|
|
683
701
|
} else {
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
`code_judge
|
|
702
|
+
command = asStringArray(
|
|
703
|
+
rawCommand,
|
|
704
|
+
`code_judge command for evaluator '${name}' in '${evalId}'`
|
|
687
705
|
);
|
|
688
706
|
}
|
|
689
|
-
if (!
|
|
690
|
-
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing
|
|
707
|
+
if (!command) {
|
|
708
|
+
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing command`);
|
|
691
709
|
continue;
|
|
692
710
|
}
|
|
693
711
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
@@ -732,6 +750,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
732
750
|
const knownProps2 = /* @__PURE__ */ new Set([
|
|
733
751
|
"name",
|
|
734
752
|
"type",
|
|
753
|
+
"command",
|
|
735
754
|
"script",
|
|
736
755
|
"cwd",
|
|
737
756
|
"weight",
|
|
@@ -748,7 +767,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
748
767
|
evaluators.push({
|
|
749
768
|
name,
|
|
750
769
|
type: "code",
|
|
751
|
-
|
|
770
|
+
command,
|
|
752
771
|
cwd,
|
|
753
772
|
resolvedCwd,
|
|
754
773
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
@@ -1350,20 +1369,20 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1350
1369
|
let resolvedPromptScript;
|
|
1351
1370
|
let promptScriptConfig;
|
|
1352
1371
|
if (isJsonObject2(rawPrompt)) {
|
|
1353
|
-
const
|
|
1354
|
-
rawPrompt.script,
|
|
1355
|
-
`prompt.
|
|
1372
|
+
const commandArray = asStringArray(
|
|
1373
|
+
rawPrompt.command ?? rawPrompt.script,
|
|
1374
|
+
`prompt.command for evaluator '${name}' in '${evalId}'`
|
|
1356
1375
|
);
|
|
1357
|
-
if (!
|
|
1358
|
-
throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires
|
|
1376
|
+
if (!commandArray) {
|
|
1377
|
+
throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires command array`);
|
|
1359
1378
|
}
|
|
1360
|
-
const
|
|
1361
|
-
const resolved = await resolveFileReference3(
|
|
1379
|
+
const commandPath = commandArray[commandArray.length - 1];
|
|
1380
|
+
const resolved = await resolveFileReference3(commandPath, searchRoots);
|
|
1362
1381
|
if (resolved.resolvedPath) {
|
|
1363
|
-
resolvedPromptScript = [...
|
|
1382
|
+
resolvedPromptScript = [...commandArray.slice(0, -1), path4.resolve(resolved.resolvedPath)];
|
|
1364
1383
|
} else {
|
|
1365
1384
|
throw new Error(
|
|
1366
|
-
`Evaluator '${name}' in '${evalId}': prompt
|
|
1385
|
+
`Evaluator '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
|
|
1367
1386
|
);
|
|
1368
1387
|
}
|
|
1369
1388
|
if (isJsonObject2(rawPrompt.config)) {
|
|
@@ -2594,6 +2613,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
2594
2613
|
trials: extractTrialsConfig(parsed),
|
|
2595
2614
|
targets: extractTargetsFromSuite(parsed),
|
|
2596
2615
|
cacheConfig: extractCacheConfig(parsed),
|
|
2616
|
+
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
2597
2617
|
...metadata !== void 0 && { metadata }
|
|
2598
2618
|
};
|
|
2599
2619
|
}
|
|
@@ -2784,16 +2804,16 @@ var loadEvalCaseById = loadTestById;
|
|
|
2784
2804
|
function parseWorkspaceScriptConfig(raw, evalFileDir) {
|
|
2785
2805
|
if (!isJsonObject(raw)) return void 0;
|
|
2786
2806
|
const obj = raw;
|
|
2787
|
-
const
|
|
2788
|
-
if (!Array.isArray(
|
|
2789
|
-
const
|
|
2790
|
-
if (
|
|
2807
|
+
const commandSource = obj.command ?? obj.script;
|
|
2808
|
+
if (!Array.isArray(commandSource) || commandSource.length === 0) return void 0;
|
|
2809
|
+
const commandArr = commandSource.filter((s) => typeof s === "string");
|
|
2810
|
+
if (commandArr.length === 0) return void 0;
|
|
2791
2811
|
const timeoutMs = typeof obj.timeout_ms === "number" ? obj.timeout_ms : void 0;
|
|
2792
2812
|
let cwd = typeof obj.cwd === "string" ? obj.cwd : void 0;
|
|
2793
2813
|
if (cwd && !path8.isAbsolute(cwd)) {
|
|
2794
2814
|
cwd = path8.resolve(evalFileDir, cwd);
|
|
2795
2815
|
}
|
|
2796
|
-
const config = {
|
|
2816
|
+
const config = { command: commandArr };
|
|
2797
2817
|
if (timeoutMs !== void 0) {
|
|
2798
2818
|
return { ...config, timeout_ms: timeoutMs, ...cwd !== void 0 && { cwd } };
|
|
2799
2819
|
}
|
|
@@ -3871,50 +3891,58 @@ var CliProvider = class {
|
|
|
3871
3891
|
await this.ensureHealthy(request.signal);
|
|
3872
3892
|
const effectiveCwd = request.cwd ?? this.config.cwd;
|
|
3873
3893
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
3874
|
-
const templateValues = buildTemplateValues(
|
|
3875
|
-
|
|
3894
|
+
const { values: templateValues, promptFilePath } = await buildTemplateValues(
|
|
3895
|
+
request,
|
|
3896
|
+
this.config,
|
|
3897
|
+
outputFilePath
|
|
3898
|
+
);
|
|
3899
|
+
const renderedCommand = renderTemplate(this.config.command, templateValues);
|
|
3876
3900
|
if (this.verbose) {
|
|
3877
3901
|
console.log(
|
|
3878
3902
|
`[cli-provider:${this.targetName}] cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
|
|
3879
3903
|
);
|
|
3880
3904
|
}
|
|
3881
|
-
|
|
3882
|
-
|
|
3883
|
-
|
|
3884
|
-
env: process.env,
|
|
3885
|
-
timeoutMs: this.config.timeoutMs,
|
|
3886
|
-
signal: request.signal
|
|
3887
|
-
});
|
|
3888
|
-
const measuredDurationMs = Date.now() - startTime;
|
|
3889
|
-
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
3890
|
-
if (request.signal?.aborted) {
|
|
3891
|
-
throw new Error("CLI provider request was aborted");
|
|
3892
|
-
}
|
|
3893
|
-
if (result.timedOut) {
|
|
3894
|
-
throw new Error(
|
|
3895
|
-
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
3896
|
-
);
|
|
3897
|
-
}
|
|
3898
|
-
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
3899
|
-
const detail = result.stderr.trim() || result.stdout.trim();
|
|
3900
|
-
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
3901
|
-
throw new Error(message);
|
|
3902
|
-
}
|
|
3903
|
-
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
3904
|
-
const parsed = this.parseOutputContent(responseContent);
|
|
3905
|
-
return {
|
|
3906
|
-
output: parsed.output,
|
|
3907
|
-
tokenUsage: parsed.tokenUsage,
|
|
3908
|
-
costUsd: parsed.costUsd,
|
|
3909
|
-
durationMs: parsed.durationMs ?? measuredDurationMs,
|
|
3910
|
-
raw: {
|
|
3911
|
-
command: renderedCommand,
|
|
3912
|
-
stderr: result.stderr,
|
|
3913
|
-
exitCode: result.exitCode ?? 0,
|
|
3905
|
+
try {
|
|
3906
|
+
const startTime = Date.now();
|
|
3907
|
+
const result = await this.runCommand(renderedCommand, {
|
|
3914
3908
|
cwd: effectiveCwd,
|
|
3915
|
-
|
|
3909
|
+
env: process.env,
|
|
3910
|
+
timeoutMs: this.config.timeoutMs,
|
|
3911
|
+
signal: request.signal
|
|
3912
|
+
});
|
|
3913
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
3914
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
3915
|
+
if (request.signal?.aborted) {
|
|
3916
|
+
throw new Error("CLI provider request was aborted");
|
|
3917
|
+
}
|
|
3918
|
+
if (result.timedOut) {
|
|
3919
|
+
throw new Error(
|
|
3920
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
3921
|
+
);
|
|
3922
|
+
}
|
|
3923
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
3924
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
3925
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
3926
|
+
throw new Error(message);
|
|
3916
3927
|
}
|
|
3917
|
-
|
|
3928
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
3929
|
+
const parsed = this.parseOutputContent(responseContent);
|
|
3930
|
+
return {
|
|
3931
|
+
output: parsed.output,
|
|
3932
|
+
tokenUsage: parsed.tokenUsage,
|
|
3933
|
+
costUsd: parsed.costUsd,
|
|
3934
|
+
durationMs: parsed.durationMs ?? measuredDurationMs,
|
|
3935
|
+
raw: {
|
|
3936
|
+
command: renderedCommand,
|
|
3937
|
+
stderr: result.stderr,
|
|
3938
|
+
exitCode: result.exitCode ?? 0,
|
|
3939
|
+
cwd: effectiveCwd,
|
|
3940
|
+
outputFile: outputFilePath
|
|
3941
|
+
}
|
|
3942
|
+
};
|
|
3943
|
+
} finally {
|
|
3944
|
+
await cleanupTempFile(promptFilePath, this.keepTempFiles);
|
|
3945
|
+
}
|
|
3918
3946
|
}
|
|
3919
3947
|
async invokeBatch(requests) {
|
|
3920
3948
|
if (requests.length === 0) {
|
|
@@ -3937,7 +3965,7 @@ var CliProvider = class {
|
|
|
3937
3965
|
batchInputFiles.push(...request.inputFiles);
|
|
3938
3966
|
}
|
|
3939
3967
|
}
|
|
3940
|
-
const templateValues = buildTemplateValues(
|
|
3968
|
+
const { values: templateValues, promptFilePath } = await buildTemplateValues(
|
|
3941
3969
|
{
|
|
3942
3970
|
question: "",
|
|
3943
3971
|
guidelines: "",
|
|
@@ -3948,87 +3976,91 @@ var CliProvider = class {
|
|
|
3948
3976
|
this.config,
|
|
3949
3977
|
outputFilePath
|
|
3950
3978
|
);
|
|
3951
|
-
const renderedCommand = renderTemplate(this.config.
|
|
3979
|
+
const renderedCommand = renderTemplate(this.config.command, templateValues);
|
|
3952
3980
|
if (this.verbose) {
|
|
3953
3981
|
console.log(
|
|
3954
3982
|
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
3955
3983
|
);
|
|
3956
3984
|
}
|
|
3957
|
-
|
|
3958
|
-
|
|
3959
|
-
|
|
3960
|
-
|
|
3961
|
-
|
|
3962
|
-
|
|
3963
|
-
|
|
3964
|
-
|
|
3965
|
-
|
|
3966
|
-
if (
|
|
3967
|
-
|
|
3968
|
-
|
|
3969
|
-
|
|
3970
|
-
|
|
3971
|
-
|
|
3972
|
-
|
|
3973
|
-
|
|
3974
|
-
|
|
3975
|
-
|
|
3976
|
-
|
|
3977
|
-
|
|
3978
|
-
|
|
3979
|
-
|
|
3980
|
-
|
|
3981
|
-
|
|
3982
|
-
|
|
3983
|
-
const
|
|
3984
|
-
|
|
3985
|
-
|
|
3986
|
-
|
|
3987
|
-
|
|
3988
|
-
|
|
3989
|
-
|
|
3990
|
-
|
|
3991
|
-
|
|
3992
|
-
|
|
3993
|
-
|
|
3985
|
+
try {
|
|
3986
|
+
const startTime = Date.now();
|
|
3987
|
+
const result = await this.runCommand(renderedCommand, {
|
|
3988
|
+
cwd: this.config.cwd,
|
|
3989
|
+
env: process.env,
|
|
3990
|
+
timeoutMs: this.config.timeoutMs,
|
|
3991
|
+
signal: controller.signal
|
|
3992
|
+
});
|
|
3993
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
3994
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
3995
|
+
if (controller.signal.aborted) {
|
|
3996
|
+
throw new Error("CLI provider request was aborted");
|
|
3997
|
+
}
|
|
3998
|
+
if (result.timedOut) {
|
|
3999
|
+
throw new Error(
|
|
4000
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
4001
|
+
);
|
|
4002
|
+
}
|
|
4003
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
4004
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
4005
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
4006
|
+
throw new Error(message);
|
|
4007
|
+
}
|
|
4008
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
4009
|
+
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
4010
|
+
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
4011
|
+
const responses = requests.map((request) => {
|
|
4012
|
+
const evalCaseId = request.evalCaseId;
|
|
4013
|
+
if (!evalCaseId) {
|
|
4014
|
+
return {
|
|
4015
|
+
output: [],
|
|
4016
|
+
durationMs: perRequestFallbackMs,
|
|
4017
|
+
raw: {
|
|
4018
|
+
command: renderedCommand,
|
|
4019
|
+
stderr: result.stderr,
|
|
4020
|
+
exitCode: result.exitCode ?? 0,
|
|
4021
|
+
cwd: this.config.cwd,
|
|
4022
|
+
outputFile: outputFilePath
|
|
4023
|
+
}
|
|
4024
|
+
};
|
|
4025
|
+
}
|
|
4026
|
+
const parsed = recordsById.get(evalCaseId);
|
|
4027
|
+
if (!parsed) {
|
|
4028
|
+
const errorMessage = `Batch output missing id '${evalCaseId}'`;
|
|
4029
|
+
if (this.verbose) {
|
|
4030
|
+
console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
|
|
3994
4031
|
}
|
|
3995
|
-
|
|
3996
|
-
|
|
3997
|
-
|
|
3998
|
-
|
|
3999
|
-
|
|
4000
|
-
|
|
4001
|
-
|
|
4032
|
+
return {
|
|
4033
|
+
output: [{ role: "assistant", content: `Error: ${errorMessage}` }],
|
|
4034
|
+
durationMs: perRequestFallbackMs,
|
|
4035
|
+
raw: {
|
|
4036
|
+
command: renderedCommand,
|
|
4037
|
+
stderr: result.stderr,
|
|
4038
|
+
exitCode: result.exitCode ?? 0,
|
|
4039
|
+
cwd: this.config.cwd,
|
|
4040
|
+
outputFile: outputFilePath,
|
|
4041
|
+
error: errorMessage
|
|
4042
|
+
}
|
|
4043
|
+
};
|
|
4002
4044
|
}
|
|
4003
4045
|
return {
|
|
4004
|
-
output:
|
|
4005
|
-
|
|
4046
|
+
output: parsed.output,
|
|
4047
|
+
tokenUsage: parsed.tokenUsage,
|
|
4048
|
+
costUsd: parsed.costUsd,
|
|
4049
|
+
durationMs: parsed.durationMs ?? perRequestFallbackMs,
|
|
4006
4050
|
raw: {
|
|
4007
4051
|
command: renderedCommand,
|
|
4008
4052
|
stderr: result.stderr,
|
|
4009
4053
|
exitCode: result.exitCode ?? 0,
|
|
4010
4054
|
cwd: this.config.cwd,
|
|
4011
4055
|
outputFile: outputFilePath,
|
|
4012
|
-
|
|
4056
|
+
recordId: evalCaseId
|
|
4013
4057
|
}
|
|
4014
4058
|
};
|
|
4015
|
-
}
|
|
4016
|
-
return
|
|
4017
|
-
|
|
4018
|
-
|
|
4019
|
-
|
|
4020
|
-
durationMs: parsed.durationMs ?? perRequestFallbackMs,
|
|
4021
|
-
raw: {
|
|
4022
|
-
command: renderedCommand,
|
|
4023
|
-
stderr: result.stderr,
|
|
4024
|
-
exitCode: result.exitCode ?? 0,
|
|
4025
|
-
cwd: this.config.cwd,
|
|
4026
|
-
outputFile: outputFilePath,
|
|
4027
|
-
recordId: evalCaseId
|
|
4028
|
-
}
|
|
4029
|
-
};
|
|
4030
|
-
});
|
|
4031
|
-
return responses;
|
|
4059
|
+
});
|
|
4060
|
+
return responses;
|
|
4061
|
+
} finally {
|
|
4062
|
+
await cleanupTempFile(promptFilePath, this.keepTempFiles);
|
|
4063
|
+
}
|
|
4032
4064
|
}
|
|
4033
4065
|
/**
|
|
4034
4066
|
* Parse output content from CLI.
|
|
@@ -4143,7 +4175,7 @@ var CliProvider = class {
|
|
|
4143
4175
|
return;
|
|
4144
4176
|
}
|
|
4145
4177
|
const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
|
|
4146
|
-
if (healthcheck
|
|
4178
|
+
if ("url" in healthcheck && healthcheck.url) {
|
|
4147
4179
|
const controller = new AbortController();
|
|
4148
4180
|
const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
|
|
4149
4181
|
signal?.addEventListener("abort", () => controller.abort(), { once: true });
|
|
@@ -4162,50 +4194,70 @@ var CliProvider = class {
|
|
|
4162
4194
|
}
|
|
4163
4195
|
return;
|
|
4164
4196
|
}
|
|
4165
|
-
const
|
|
4166
|
-
|
|
4167
|
-
|
|
4168
|
-
|
|
4169
|
-
|
|
4170
|
-
|
|
4171
|
-
|
|
4172
|
-
|
|
4173
|
-
|
|
4174
|
-
|
|
4175
|
-
|
|
4176
|
-
|
|
4177
|
-
|
|
4197
|
+
const hcCommand = "command" in healthcheck ? healthcheck.command : void 0;
|
|
4198
|
+
if (!hcCommand) {
|
|
4199
|
+
throw new Error(`CLI healthcheck for '${this.targetName}': 'command' or 'url' is required`);
|
|
4200
|
+
}
|
|
4201
|
+
const { values: templateValues, promptFilePath } = await buildTemplateValues(
|
|
4202
|
+
{
|
|
4203
|
+
question: "",
|
|
4204
|
+
guidelines: "",
|
|
4205
|
+
inputFiles: [],
|
|
4206
|
+
evalCaseId: "healthcheck",
|
|
4207
|
+
attempt: 0
|
|
4208
|
+
},
|
|
4209
|
+
this.config,
|
|
4210
|
+
generateOutputFilePath("healthcheck")
|
|
4178
4211
|
);
|
|
4212
|
+
const renderedCommand = renderTemplate(hcCommand, templateValues);
|
|
4213
|
+
const hcCwd = "cwd" in healthcheck ? healthcheck.cwd : void 0;
|
|
4179
4214
|
if (this.verbose) {
|
|
4180
4215
|
console.log(
|
|
4181
|
-
`[cli-provider:${this.targetName}] (healthcheck) cwd=${
|
|
4216
|
+
`[cli-provider:${this.targetName}] (healthcheck) cwd=${hcCwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
4182
4217
|
);
|
|
4183
4218
|
}
|
|
4184
|
-
|
|
4185
|
-
|
|
4186
|
-
|
|
4187
|
-
|
|
4188
|
-
|
|
4189
|
-
|
|
4190
|
-
|
|
4191
|
-
|
|
4192
|
-
|
|
4193
|
-
|
|
4194
|
-
|
|
4219
|
+
try {
|
|
4220
|
+
const result = await this.runCommand(renderedCommand, {
|
|
4221
|
+
cwd: hcCwd ?? this.config.cwd,
|
|
4222
|
+
env: process.env,
|
|
4223
|
+
timeoutMs,
|
|
4224
|
+
signal
|
|
4225
|
+
});
|
|
4226
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
4227
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
4228
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
4229
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
|
|
4230
|
+
throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
|
|
4231
|
+
}
|
|
4232
|
+
} finally {
|
|
4233
|
+
await cleanupTempFile(promptFilePath, this.keepTempFiles);
|
|
4195
4234
|
}
|
|
4196
4235
|
}
|
|
4197
4236
|
};
|
|
4198
|
-
function buildTemplateValues(request, config, outputFilePath) {
|
|
4237
|
+
async function buildTemplateValues(request, config, outputFilePath) {
|
|
4199
4238
|
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
4239
|
+
const promptFilePath = generateOutputFilePath(request.evalCaseId, ".prompt.txt");
|
|
4240
|
+
await fs.writeFile(promptFilePath, request.question ?? "", "utf8");
|
|
4200
4241
|
return {
|
|
4201
|
-
|
|
4202
|
-
|
|
4203
|
-
|
|
4204
|
-
|
|
4205
|
-
|
|
4206
|
-
|
|
4242
|
+
values: {
|
|
4243
|
+
PROMPT: shellEscape(request.question ?? ""),
|
|
4244
|
+
PROMPT_FILE: shellEscape(promptFilePath),
|
|
4245
|
+
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
4246
|
+
EVAL_ID: shellEscape(request.evalCaseId ?? ""),
|
|
4247
|
+
ATTEMPT: shellEscape(String(request.attempt ?? 0)),
|
|
4248
|
+
FILES: formatFileList(inputFiles, config.filesFormat),
|
|
4249
|
+
OUTPUT_FILE: shellEscape(outputFilePath)
|
|
4250
|
+
},
|
|
4251
|
+
promptFilePath
|
|
4207
4252
|
};
|
|
4208
4253
|
}
|
|
4254
|
+
async function cleanupTempFile(filePath, keepTempFiles) {
|
|
4255
|
+
if (!filePath || keepTempFiles) {
|
|
4256
|
+
return;
|
|
4257
|
+
}
|
|
4258
|
+
await fs.unlink(filePath).catch(() => {
|
|
4259
|
+
});
|
|
4260
|
+
}
|
|
4209
4261
|
function normalizeInputFiles2(inputFiles) {
|
|
4210
4262
|
if (!inputFiles || inputFiles.length === 0) {
|
|
4211
4263
|
return void 0;
|
|
@@ -8086,7 +8138,7 @@ async function discoverProviders(registry, baseDir) {
|
|
|
8086
8138
|
}
|
|
8087
8139
|
registry.register(kindName, (target) => {
|
|
8088
8140
|
return new CliProvider(target.name, {
|
|
8089
|
-
|
|
8141
|
+
command: `bun run ${filePath} {PROMPT}`
|
|
8090
8142
|
});
|
|
8091
8143
|
});
|
|
8092
8144
|
discoveredKinds.push(kindName);
|
|
@@ -8599,13 +8651,13 @@ function toCamelCaseDeep(obj) {
|
|
|
8599
8651
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
8600
8652
|
var CodeEvaluator = class {
|
|
8601
8653
|
kind = "code";
|
|
8602
|
-
|
|
8654
|
+
command;
|
|
8603
8655
|
cwd;
|
|
8604
8656
|
agentTimeoutMs;
|
|
8605
8657
|
config;
|
|
8606
8658
|
target;
|
|
8607
8659
|
constructor(options) {
|
|
8608
|
-
this.
|
|
8660
|
+
this.command = options.command ?? options.script ?? [];
|
|
8609
8661
|
this.cwd = options.cwd;
|
|
8610
8662
|
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
8611
8663
|
this.config = options.config;
|
|
@@ -8664,7 +8716,7 @@ var CodeEvaluator = class {
|
|
|
8664
8716
|
const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
|
|
8665
8717
|
try {
|
|
8666
8718
|
const stdout = await executeScript(
|
|
8667
|
-
this.
|
|
8719
|
+
this.command,
|
|
8668
8720
|
inputPayload,
|
|
8669
8721
|
this.agentTimeoutMs,
|
|
8670
8722
|
this.cwd,
|
|
@@ -8678,7 +8730,7 @@ var CodeEvaluator = class {
|
|
|
8678
8730
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
8679
8731
|
const proxyUsage = getProxyUsage?.();
|
|
8680
8732
|
const evaluatorRawRequest = {
|
|
8681
|
-
|
|
8733
|
+
command: this.command,
|
|
8682
8734
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
8683
8735
|
...proxyUsage ? {
|
|
8684
8736
|
target_proxy: {
|
|
@@ -8708,7 +8760,7 @@ var CodeEvaluator = class {
|
|
|
8708
8760
|
expectedAspectCount: 1,
|
|
8709
8761
|
reasoning: message,
|
|
8710
8762
|
evaluatorRawRequest: {
|
|
8711
|
-
|
|
8763
|
+
command: this.command,
|
|
8712
8764
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
8713
8765
|
...proxyUsage ? {
|
|
8714
8766
|
target_proxy: {
|
|
@@ -11638,7 +11690,7 @@ var llmJudgeFactory = (config, context) => {
|
|
|
11638
11690
|
var codeFactory = (config, context) => {
|
|
11639
11691
|
const c = config;
|
|
11640
11692
|
return new CodeEvaluator({
|
|
11641
|
-
|
|
11693
|
+
command: c.command ?? c.script ?? [],
|
|
11642
11694
|
cwd: c.resolvedCwd ?? c.cwd,
|
|
11643
11695
|
agentTimeoutMs: context.agentTimeoutMs,
|
|
11644
11696
|
config: c.config,
|
|
@@ -11820,7 +11872,7 @@ async function discoverAssertions(registry, baseDir) {
|
|
|
11820
11872
|
}
|
|
11821
11873
|
const factory = (_config, context) => {
|
|
11822
11874
|
return new CodeEvaluator({
|
|
11823
|
-
|
|
11875
|
+
command: ["bun", "run", filePath],
|
|
11824
11876
|
agentTimeoutMs: context.agentTimeoutMs
|
|
11825
11877
|
});
|
|
11826
11878
|
};
|
|
@@ -12174,7 +12226,8 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
12174
12226
|
});
|
|
12175
12227
|
const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
|
|
12176
12228
|
const cwd = config.cwd;
|
|
12177
|
-
const
|
|
12229
|
+
const commandArray = config.command ?? config.script ?? [];
|
|
12230
|
+
const result = await execFileWithStdin(commandArray, stdin, {
|
|
12178
12231
|
timeoutMs,
|
|
12179
12232
|
cwd
|
|
12180
12233
|
});
|
|
@@ -12221,7 +12274,8 @@ async function runEvaluation(options) {
|
|
|
12221
12274
|
keepWorkspaces,
|
|
12222
12275
|
cleanupWorkspaces,
|
|
12223
12276
|
trials,
|
|
12224
|
-
streamCallbacks
|
|
12277
|
+
streamCallbacks,
|
|
12278
|
+
totalBudgetUsd
|
|
12225
12279
|
} = options;
|
|
12226
12280
|
let useCache = options.useCache;
|
|
12227
12281
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -12394,10 +12448,39 @@ async function runEvaluation(options) {
|
|
|
12394
12448
|
let nextWorkerId = 1;
|
|
12395
12449
|
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
12396
12450
|
let beforeAllOutputAttached = false;
|
|
12451
|
+
let cumulativeBudgetCost = 0;
|
|
12452
|
+
let budgetExhausted = false;
|
|
12397
12453
|
const promises = filteredEvalCases.map(
|
|
12398
12454
|
(evalCase) => limit(async () => {
|
|
12399
12455
|
const workerId = nextWorkerId++;
|
|
12400
12456
|
workerIdByEvalId.set(evalCase.id, workerId);
|
|
12457
|
+
if (totalBudgetUsd !== void 0 && budgetExhausted) {
|
|
12458
|
+
const budgetResult = {
|
|
12459
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
12460
|
+
testId: evalCase.id,
|
|
12461
|
+
dataset: evalCase.dataset,
|
|
12462
|
+
score: 0,
|
|
12463
|
+
hits: [],
|
|
12464
|
+
misses: [],
|
|
12465
|
+
answer: "",
|
|
12466
|
+
target: target.name,
|
|
12467
|
+
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
12468
|
+
budgetExceeded: true
|
|
12469
|
+
};
|
|
12470
|
+
if (onProgress) {
|
|
12471
|
+
await onProgress({
|
|
12472
|
+
workerId,
|
|
12473
|
+
testId: evalCase.id,
|
|
12474
|
+
status: "failed",
|
|
12475
|
+
completedAt: Date.now(),
|
|
12476
|
+
error: budgetResult.error
|
|
12477
|
+
});
|
|
12478
|
+
}
|
|
12479
|
+
if (onResult) {
|
|
12480
|
+
await onResult(budgetResult);
|
|
12481
|
+
}
|
|
12482
|
+
return budgetResult;
|
|
12483
|
+
}
|
|
12401
12484
|
if (onProgress) {
|
|
12402
12485
|
await onProgress({
|
|
12403
12486
|
workerId,
|
|
@@ -12431,6 +12514,23 @@ async function runEvaluation(options) {
|
|
|
12431
12514
|
typeRegistry
|
|
12432
12515
|
};
|
|
12433
12516
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
12517
|
+
if (totalBudgetUsd !== void 0) {
|
|
12518
|
+
let caseCost;
|
|
12519
|
+
if (result.trials && result.trials.length > 0) {
|
|
12520
|
+
const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
|
|
12521
|
+
if (trialCostSum > 0) {
|
|
12522
|
+
caseCost = trialCostSum;
|
|
12523
|
+
}
|
|
12524
|
+
} else {
|
|
12525
|
+
caseCost = result.trace?.costUsd;
|
|
12526
|
+
}
|
|
12527
|
+
if (caseCost !== void 0) {
|
|
12528
|
+
cumulativeBudgetCost += caseCost;
|
|
12529
|
+
if (cumulativeBudgetCost >= totalBudgetUsd) {
|
|
12530
|
+
budgetExhausted = true;
|
|
12531
|
+
}
|
|
12532
|
+
}
|
|
12533
|
+
}
|
|
12434
12534
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
12435
12535
|
result = { ...result, beforeAllOutput };
|
|
12436
12536
|
beforeAllOutputAttached = true;
|