@agentv/core 4.25.1 → 4.25.3-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-MUIGGIP3.js +7 -0
- package/dist/chunk-5XV3FAAD.js +616 -0
- package/dist/chunk-5XV3FAAD.js.map +1 -0
- package/dist/{chunk-6HLBKYE2.js → chunk-CALQDF2Y.js} +1 -1
- package/dist/chunk-CALQDF2Y.js.map +1 -0
- package/dist/{chunk-IXTJEXWN.js → chunk-EVEZQXIS.js} +187 -551
- package/dist/chunk-EVEZQXIS.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +591 -419
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +78 -8
- package/dist/index.d.ts +78 -8
- package/dist/index.js +7 -12
- package/dist/index.js.map +1 -1
- package/dist/ts-eval-loader-E6MROJGR.js +12 -0
- package/package.json +2 -7
- package/dist/agentv-provider-TXM4UEUT.js +0 -7
- package/dist/chunk-6HLBKYE2.js.map +0 -1
- package/dist/chunk-IXTJEXWN.js.map +0 -1
- package/dist/chunk-PRNXHNLF.js +0 -65
- package/dist/chunk-PRNXHNLF.js.map +0 -1
- package/dist/ts-eval-loader-4CFPGHGT.js +0 -12
- /package/dist/{agentv-provider-TXM4UEUT.js.map → agentv-provider-MUIGGIP3.js.map} +0 -0
- /package/dist/{ts-eval-loader-4CFPGHGT.js.map → ts-eval-loader-E6MROJGR.js.map} +0 -0
|
@@ -17,14 +17,19 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveDelegatedTargetDefinition,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-
|
|
20
|
+
} from "./chunk-CALQDF2Y.js";
|
|
21
21
|
import {
|
|
22
22
|
execFileWithStdin,
|
|
23
23
|
execShellWithStdin
|
|
24
24
|
} from "./chunk-3WGHC7LC.js";
|
|
25
25
|
import {
|
|
26
|
-
AgentvProvider
|
|
27
|
-
|
|
26
|
+
AgentvProvider,
|
|
27
|
+
AnthropicProvider,
|
|
28
|
+
AzureProvider,
|
|
29
|
+
GeminiProvider,
|
|
30
|
+
OpenAIProvider,
|
|
31
|
+
OpenRouterProvider
|
|
32
|
+
} from "./chunk-5XV3FAAD.js";
|
|
28
33
|
|
|
29
34
|
// src/evaluation/loaders/ts-eval-loader.ts
|
|
30
35
|
import path46 from "node:path";
|
|
@@ -730,6 +735,8 @@ var CodeGrader = class {
|
|
|
730
735
|
const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
|
|
731
736
|
try {
|
|
732
737
|
let stdout;
|
|
738
|
+
let exitCode = 0;
|
|
739
|
+
let execStderr = "";
|
|
733
740
|
if (context.dockerConfig) {
|
|
734
741
|
const { DockerWorkspaceProvider } = await import("./docker-workspace-RPPXBT27.js");
|
|
735
742
|
const dockerProvider = new DockerWorkspaceProvider(context.dockerConfig);
|
|
@@ -738,31 +745,42 @@ var CodeGrader = class {
|
|
|
738
745
|
stdin: inputPayload,
|
|
739
746
|
repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos)
|
|
740
747
|
});
|
|
741
|
-
|
|
742
|
-
const trimmedErr = result.stderr.trim();
|
|
743
|
-
throw new Error(
|
|
744
|
-
trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
|
|
745
|
-
);
|
|
746
|
-
}
|
|
748
|
+
exitCode = result.exitCode;
|
|
747
749
|
stdout = result.stdout.trim();
|
|
750
|
+
execStderr = result.stderr;
|
|
748
751
|
} else {
|
|
749
|
-
|
|
752
|
+
const result = await runScriptRaw(
|
|
750
753
|
this.command,
|
|
751
754
|
inputPayload,
|
|
752
755
|
this.agentTimeoutMs,
|
|
753
756
|
this.cwd,
|
|
754
757
|
env
|
|
755
758
|
);
|
|
759
|
+
exitCode = result.exitCode;
|
|
760
|
+
stdout = result.stdout.trim();
|
|
761
|
+
execStderr = result.stderr;
|
|
756
762
|
}
|
|
757
|
-
const
|
|
758
|
-
const
|
|
759
|
-
|
|
763
|
+
const looksLikeJson = stdout.startsWith("{") || stdout.startsWith("[");
|
|
764
|
+
const hasStderr = execStderr.trim().length > 0;
|
|
765
|
+
if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
|
|
766
|
+
const trimmedErr = formatStderr(execStderr);
|
|
767
|
+
throw new Error(
|
|
768
|
+
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
769
|
+
);
|
|
770
|
+
}
|
|
771
|
+
const rawParsed = parseJsonSafe(stdout);
|
|
772
|
+
const parsed = rawParsed != null && typeof rawParsed === "object" && !Array.isArray(rawParsed) ? rawParsed : void 0;
|
|
773
|
+
const passed = exitCode === 0;
|
|
774
|
+
const assertions = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
760
775
|
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
761
776
|
).map((a) => ({
|
|
762
777
|
text: String(a.text),
|
|
763
778
|
passed: Boolean(a.passed),
|
|
764
779
|
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
765
|
-
})) : [];
|
|
780
|
+
})) : parsed == null ? [{ text: stdout.trim() || (passed ? "exit 0" : `exit ${exitCode}`), passed }] : [];
|
|
781
|
+
const score = parsed != null ? clampScore(
|
|
782
|
+
typeof parsed.score === "number" ? parsed.score : assertions.length > 0 ? assertions.filter((a) => a.passed).length / assertions.length : 0
|
|
783
|
+
) : passed ? 1 : 0;
|
|
766
784
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
767
785
|
const proxyUsage = getProxyUsage?.();
|
|
768
786
|
const graderRawRequest = {
|
|
@@ -820,8 +838,17 @@ var CodeGrader = class {
|
|
|
820
838
|
}
|
|
821
839
|
}
|
|
822
840
|
};
|
|
841
|
+
async function runScriptRaw(scriptPath, input, agentTimeoutMs, cwd, env) {
|
|
842
|
+
return typeof scriptPath === "string" ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
|
|
843
|
+
}
|
|
823
844
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
|
|
824
|
-
const { stdout, stderr, exitCode } =
|
|
845
|
+
const { stdout, stderr, exitCode } = await runScriptRaw(
|
|
846
|
+
scriptPath,
|
|
847
|
+
input,
|
|
848
|
+
agentTimeoutMs,
|
|
849
|
+
cwd,
|
|
850
|
+
env
|
|
851
|
+
);
|
|
825
852
|
if (exitCode !== 0) {
|
|
826
853
|
const trimmedErr = formatStderr(stderr);
|
|
827
854
|
throw new Error(
|
|
@@ -841,13 +868,9 @@ function formatStderr(stderr) {
|
|
|
841
868
|
${tail}`;
|
|
842
869
|
}
|
|
843
870
|
|
|
844
|
-
// src/evaluation/graders/composite.ts
|
|
845
|
-
import { generateText as generateText2 } from "ai";
|
|
846
|
-
|
|
847
871
|
// src/evaluation/graders/llm-grader.ts
|
|
848
872
|
import fs from "node:fs/promises";
|
|
849
873
|
import path3 from "node:path";
|
|
850
|
-
import { generateText, stepCountIs, tool } from "ai";
|
|
851
874
|
import { z } from "zod";
|
|
852
875
|
|
|
853
876
|
// src/evaluation/content-preprocessor.ts
|
|
@@ -1357,18 +1380,15 @@ ${context.toolCalls}`;
|
|
|
1357
1380
|
}
|
|
1358
1381
|
}
|
|
1359
1382
|
// ---------------------------------------------------------------------------
|
|
1360
|
-
// Built-in agent mode (agentv provider —
|
|
1383
|
+
// Built-in agent mode (agentv provider — provider.invoke() with filesystem tools)
|
|
1361
1384
|
// ---------------------------------------------------------------------------
|
|
1362
1385
|
/**
|
|
1363
|
-
* Built-in mode:
|
|
1386
|
+
* Built-in mode: drives the grader through provider.invoke() with the
|
|
1387
|
+
* sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
|
|
1388
|
+
* provider runs the agent loop (tool call → tool execute → next model
|
|
1389
|
+
* turn) until the model stops requesting tools or maxSteps is hit.
|
|
1364
1390
|
*/
|
|
1365
1391
|
async evaluateBuiltIn(context, graderProvider) {
|
|
1366
|
-
const model = graderProvider.asLanguageModel?.();
|
|
1367
|
-
if (!model) {
|
|
1368
|
-
throw new Error(
|
|
1369
|
-
`Grader provider '${graderProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent mode`
|
|
1370
|
-
);
|
|
1371
|
-
}
|
|
1372
1392
|
const workspacePath = context.workspacePath;
|
|
1373
1393
|
if (!workspacePath) {
|
|
1374
1394
|
throw new Error(
|
|
@@ -1387,18 +1407,21 @@ ${context.toolCalls}`;
|
|
|
1387
1407
|
maxSteps: this.maxSteps
|
|
1388
1408
|
};
|
|
1389
1409
|
try {
|
|
1390
|
-
const
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1410
|
+
const response = await graderProvider.invoke({
|
|
1411
|
+
question: userPrompt,
|
|
1412
|
+
systemPrompt,
|
|
1413
|
+
evalCaseId: context.evalCase.id,
|
|
1414
|
+
attempt: context.attempt,
|
|
1415
|
+
temperature: this.temperature ?? 0,
|
|
1394
1416
|
tools: fsTools,
|
|
1395
|
-
|
|
1396
|
-
temperature: this.temperature ?? 0
|
|
1417
|
+
maxSteps: this.maxSteps
|
|
1397
1418
|
});
|
|
1398
|
-
const
|
|
1419
|
+
const text = extractLastAssistantContent(response.output);
|
|
1420
|
+
const stepCount = response.steps?.count ?? 1;
|
|
1421
|
+
const toolCallCount = response.steps?.toolCallCount ?? 0;
|
|
1399
1422
|
const details = {
|
|
1400
1423
|
mode: "built-in",
|
|
1401
|
-
steps:
|
|
1424
|
+
steps: stepCount,
|
|
1402
1425
|
tool_calls: toolCallCount
|
|
1403
1426
|
};
|
|
1404
1427
|
return this.parseAgentResult(
|
|
@@ -1850,43 +1873,14 @@ ${outputSchema}`;
|
|
|
1850
1873
|
}
|
|
1851
1874
|
async generateStructuredResponse(options) {
|
|
1852
1875
|
const { context, graderProvider, systemPrompt, userPrompt, images } = options;
|
|
1853
|
-
const model = graderProvider.asLanguageModel?.();
|
|
1854
|
-
if (model) {
|
|
1855
|
-
const modelOptions = {
|
|
1856
|
-
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
1857
|
-
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
1858
|
-
};
|
|
1859
|
-
const hasImages = images && images.length > 0;
|
|
1860
|
-
const result = hasImages ? await generateText({
|
|
1861
|
-
model,
|
|
1862
|
-
system: systemPrompt,
|
|
1863
|
-
messages: [
|
|
1864
|
-
{
|
|
1865
|
-
role: "user",
|
|
1866
|
-
content: [
|
|
1867
|
-
{ type: "text", text: userPrompt },
|
|
1868
|
-
...toAiSdkImageParts(images)
|
|
1869
|
-
]
|
|
1870
|
-
}
|
|
1871
|
-
],
|
|
1872
|
-
...modelOptions
|
|
1873
|
-
}) : await generateText({
|
|
1874
|
-
model,
|
|
1875
|
-
system: systemPrompt,
|
|
1876
|
-
prompt: userPrompt,
|
|
1877
|
-
...modelOptions
|
|
1878
|
-
});
|
|
1879
|
-
const rawUsage = result.usage;
|
|
1880
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
1881
|
-
return { text: result.text, tokenUsage };
|
|
1882
|
-
}
|
|
1883
1876
|
const response = await graderProvider.invoke({
|
|
1884
1877
|
question: userPrompt,
|
|
1885
1878
|
systemPrompt,
|
|
1886
1879
|
evalCaseId: context.evalCase.id,
|
|
1887
1880
|
attempt: context.attempt,
|
|
1888
1881
|
maxOutputTokens: this.maxOutputTokens,
|
|
1889
|
-
temperature: this.temperature
|
|
1882
|
+
temperature: this.temperature,
|
|
1883
|
+
...images && images.length > 0 ? { images } : {}
|
|
1890
1884
|
});
|
|
1891
1885
|
return {
|
|
1892
1886
|
text: extractLastAssistantContent(response.output),
|
|
@@ -2083,13 +2077,6 @@ function extractImageBlocks(messages) {
|
|
|
2083
2077
|
}
|
|
2084
2078
|
return images;
|
|
2085
2079
|
}
|
|
2086
|
-
function toAiSdkImageParts(images) {
|
|
2087
|
-
return images.map((img) => ({
|
|
2088
|
-
type: "image",
|
|
2089
|
-
image: img.source,
|
|
2090
|
-
mediaType: img.media_type || void 0
|
|
2091
|
-
}));
|
|
2092
|
-
}
|
|
2093
2080
|
function resolveSandboxed(basePath, relativePath) {
|
|
2094
2081
|
const resolved = path3.resolve(basePath, relativePath);
|
|
2095
2082
|
if (!resolved.startsWith(basePath + path3.sep) && resolved !== basePath) {
|
|
@@ -2098,15 +2085,24 @@ function resolveSandboxed(basePath, relativePath) {
|
|
|
2098
2085
|
return resolved;
|
|
2099
2086
|
}
|
|
2100
2087
|
function createFilesystemTools(workspacePath) {
|
|
2101
|
-
return
|
|
2102
|
-
|
|
2088
|
+
return [
|
|
2089
|
+
{
|
|
2090
|
+
name: "list_files",
|
|
2103
2091
|
description: "List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).",
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2092
|
+
parameters: {
|
|
2093
|
+
type: "object",
|
|
2094
|
+
properties: {
|
|
2095
|
+
path: {
|
|
2096
|
+
type: "string",
|
|
2097
|
+
description: 'Relative path within workspace (use "." for root)',
|
|
2098
|
+
default: "."
|
|
2099
|
+
}
|
|
2100
|
+
}
|
|
2101
|
+
},
|
|
2107
2102
|
execute: async (input) => {
|
|
2103
|
+
const args = input ?? {};
|
|
2108
2104
|
try {
|
|
2109
|
-
const resolved = resolveSandboxed(workspacePath,
|
|
2105
|
+
const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
|
|
2110
2106
|
const entries = await fs.readdir(resolved, { withFileTypes: true });
|
|
2111
2107
|
return entries.map((e) => ({
|
|
2112
2108
|
name: e.name,
|
|
@@ -2116,18 +2112,25 @@ function createFilesystemTools(workspacePath) {
|
|
|
2116
2112
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
2117
2113
|
}
|
|
2118
2114
|
}
|
|
2119
|
-
}
|
|
2120
|
-
|
|
2115
|
+
},
|
|
2116
|
+
{
|
|
2117
|
+
name: "read_file",
|
|
2121
2118
|
description: "Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.",
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2119
|
+
parameters: {
|
|
2120
|
+
type: "object",
|
|
2121
|
+
properties: {
|
|
2122
|
+
path: { type: "string", description: "Relative path to file within workspace" }
|
|
2123
|
+
},
|
|
2124
|
+
required: ["path"]
|
|
2125
|
+
},
|
|
2125
2126
|
execute: async (input) => {
|
|
2127
|
+
const args = input ?? {};
|
|
2128
|
+
const relPath = args.path ?? "";
|
|
2126
2129
|
try {
|
|
2127
|
-
const resolved = resolveSandboxed(workspacePath,
|
|
2130
|
+
const resolved = resolveSandboxed(workspacePath, relPath);
|
|
2128
2131
|
const stat10 = await fs.stat(resolved);
|
|
2129
2132
|
if (stat10.isDirectory()) {
|
|
2130
|
-
return { error: `'${
|
|
2133
|
+
return { error: `'${relPath}' is a directory, not a file` };
|
|
2131
2134
|
}
|
|
2132
2135
|
const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
|
|
2133
2136
|
const fd = await fs.open(resolved, "r");
|
|
@@ -2143,19 +2146,29 @@ function createFilesystemTools(workspacePath) {
|
|
|
2143
2146
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
2144
2147
|
}
|
|
2145
2148
|
}
|
|
2146
|
-
}
|
|
2147
|
-
|
|
2149
|
+
},
|
|
2150
|
+
{
|
|
2151
|
+
name: "search_files",
|
|
2148
2152
|
description: "Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.",
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
+
parameters: {
|
|
2154
|
+
type: "object",
|
|
2155
|
+
properties: {
|
|
2156
|
+
pattern: { type: "string", description: "Regex pattern to search for" },
|
|
2157
|
+
path: {
|
|
2158
|
+
type: "string",
|
|
2159
|
+
description: 'Relative path to search within (use "." for root)',
|
|
2160
|
+
default: "."
|
|
2161
|
+
}
|
|
2162
|
+
},
|
|
2163
|
+
required: ["pattern"]
|
|
2164
|
+
},
|
|
2153
2165
|
execute: async (input) => {
|
|
2166
|
+
const args = input ?? {};
|
|
2154
2167
|
try {
|
|
2155
|
-
const resolved = resolveSandboxed(workspacePath,
|
|
2168
|
+
const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
|
|
2156
2169
|
let regex;
|
|
2157
2170
|
try {
|
|
2158
|
-
regex = new RegExp(
|
|
2171
|
+
regex = new RegExp(args.pattern ?? "", "gi");
|
|
2159
2172
|
} catch (regexErr) {
|
|
2160
2173
|
return {
|
|
2161
2174
|
error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`
|
|
@@ -2168,8 +2181,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
2168
2181
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
2169
2182
|
}
|
|
2170
2183
|
}
|
|
2171
|
-
}
|
|
2172
|
-
|
|
2184
|
+
}
|
|
2185
|
+
];
|
|
2173
2186
|
}
|
|
2174
2187
|
async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
2175
2188
|
if (matches.length >= MAX_SEARCH_MATCHES) return;
|
|
@@ -2449,25 +2462,6 @@ var CompositeGrader = class {
|
|
|
2449
2462
|
target: graderProvider.targetName
|
|
2450
2463
|
};
|
|
2451
2464
|
try {
|
|
2452
|
-
const model = graderProvider.asLanguageModel?.();
|
|
2453
|
-
if (model) {
|
|
2454
|
-
const { text } = await generateText2({
|
|
2455
|
-
model,
|
|
2456
|
-
system: systemPrompt,
|
|
2457
|
-
prompt: userPrompt
|
|
2458
|
-
});
|
|
2459
|
-
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
2460
|
-
const score2 = clampScore(data2.score);
|
|
2461
|
-
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
2462
|
-
return {
|
|
2463
|
-
score: score2,
|
|
2464
|
-
verdict: scoreToVerdict(score2),
|
|
2465
|
-
assertions: assertions2,
|
|
2466
|
-
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
2467
|
-
graderRawRequest,
|
|
2468
|
-
scores
|
|
2469
|
-
};
|
|
2470
|
-
}
|
|
2471
2465
|
const response = await graderProvider.invoke({
|
|
2472
2466
|
question: userPrompt,
|
|
2473
2467
|
systemPrompt,
|
|
@@ -2625,7 +2619,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
|
|
|
2625
2619
|
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
|
|
2626
2620
|
if (summary.eventCount === 0) return void 0;
|
|
2627
2621
|
const explorationCalls = explorationTools.reduce(
|
|
2628
|
-
(sum,
|
|
2622
|
+
(sum, tool) => sum + (summary.toolCalls[tool] ?? 0),
|
|
2629
2623
|
0
|
|
2630
2624
|
);
|
|
2631
2625
|
return explorationCalls / summary.eventCount;
|
|
@@ -4261,422 +4255,6 @@ function runEqualsAssertion(output, value) {
|
|
|
4261
4255
|
};
|
|
4262
4256
|
}
|
|
4263
4257
|
|
|
4264
|
-
// src/evaluation/providers/ai-sdk.ts
|
|
4265
|
-
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
4266
|
-
import { createAzure } from "@ai-sdk/azure";
|
|
4267
|
-
import { createGoogleGenerativeAI } from "@ai-sdk/google";
|
|
4268
|
-
import { createOpenAI } from "@ai-sdk/openai";
|
|
4269
|
-
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
|
|
4270
|
-
import { generateText as generateText3 } from "ai";
|
|
4271
|
-
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
4272
|
-
var OpenAIProvider = class {
|
|
4273
|
-
constructor(targetName, config) {
|
|
4274
|
-
this.config = config;
|
|
4275
|
-
this.id = `openai:${targetName}`;
|
|
4276
|
-
this.targetName = targetName;
|
|
4277
|
-
this.defaults = {
|
|
4278
|
-
temperature: config.temperature,
|
|
4279
|
-
maxOutputTokens: config.maxOutputTokens
|
|
4280
|
-
};
|
|
4281
|
-
this.retryConfig = config.retry;
|
|
4282
|
-
const openai = createOpenAI({
|
|
4283
|
-
apiKey: config.apiKey,
|
|
4284
|
-
baseURL: config.baseURL
|
|
4285
|
-
});
|
|
4286
|
-
this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
|
|
4287
|
-
}
|
|
4288
|
-
id;
|
|
4289
|
-
kind = "openai";
|
|
4290
|
-
targetName;
|
|
4291
|
-
model;
|
|
4292
|
-
defaults;
|
|
4293
|
-
retryConfig;
|
|
4294
|
-
async invoke(request) {
|
|
4295
|
-
return invokeModel({
|
|
4296
|
-
model: this.model,
|
|
4297
|
-
request,
|
|
4298
|
-
defaults: this.defaults,
|
|
4299
|
-
retryConfig: this.retryConfig
|
|
4300
|
-
});
|
|
4301
|
-
}
|
|
4302
|
-
asLanguageModel() {
|
|
4303
|
-
return this.model;
|
|
4304
|
-
}
|
|
4305
|
-
};
|
|
4306
|
-
var AzureProvider = class {
|
|
4307
|
-
constructor(targetName, config) {
|
|
4308
|
-
this.config = config;
|
|
4309
|
-
this.id = `azure:${targetName}`;
|
|
4310
|
-
this.targetName = targetName;
|
|
4311
|
-
this.defaults = {
|
|
4312
|
-
temperature: config.temperature,
|
|
4313
|
-
maxOutputTokens: config.maxOutputTokens
|
|
4314
|
-
};
|
|
4315
|
-
this.retryConfig = config.retry;
|
|
4316
|
-
const azure = createAzure(buildAzureOptions(config));
|
|
4317
|
-
this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
|
|
4318
|
-
}
|
|
4319
|
-
id;
|
|
4320
|
-
kind = "azure";
|
|
4321
|
-
targetName;
|
|
4322
|
-
model;
|
|
4323
|
-
defaults;
|
|
4324
|
-
retryConfig;
|
|
4325
|
-
async invoke(request) {
|
|
4326
|
-
return invokeModel({
|
|
4327
|
-
model: this.model,
|
|
4328
|
-
request,
|
|
4329
|
-
defaults: this.defaults,
|
|
4330
|
-
retryConfig: this.retryConfig
|
|
4331
|
-
});
|
|
4332
|
-
}
|
|
4333
|
-
asLanguageModel() {
|
|
4334
|
-
return this.model;
|
|
4335
|
-
}
|
|
4336
|
-
};
|
|
4337
|
-
var OpenRouterProvider = class {
|
|
4338
|
-
constructor(targetName, config) {
|
|
4339
|
-
this.config = config;
|
|
4340
|
-
this.id = `openrouter:${targetName}`;
|
|
4341
|
-
this.targetName = targetName;
|
|
4342
|
-
this.defaults = {
|
|
4343
|
-
temperature: config.temperature,
|
|
4344
|
-
maxOutputTokens: config.maxOutputTokens
|
|
4345
|
-
};
|
|
4346
|
-
this.retryConfig = config.retry;
|
|
4347
|
-
const openrouter = createOpenRouter({
|
|
4348
|
-
apiKey: config.apiKey
|
|
4349
|
-
});
|
|
4350
|
-
this.model = openrouter(config.model);
|
|
4351
|
-
}
|
|
4352
|
-
id;
|
|
4353
|
-
kind = "openrouter";
|
|
4354
|
-
targetName;
|
|
4355
|
-
model;
|
|
4356
|
-
defaults;
|
|
4357
|
-
retryConfig;
|
|
4358
|
-
async invoke(request) {
|
|
4359
|
-
return invokeModel({
|
|
4360
|
-
model: this.model,
|
|
4361
|
-
request,
|
|
4362
|
-
defaults: this.defaults,
|
|
4363
|
-
retryConfig: this.retryConfig
|
|
4364
|
-
});
|
|
4365
|
-
}
|
|
4366
|
-
asLanguageModel() {
|
|
4367
|
-
return this.model;
|
|
4368
|
-
}
|
|
4369
|
-
};
|
|
4370
|
-
var AnthropicProvider = class {
|
|
4371
|
-
constructor(targetName, config) {
|
|
4372
|
-
this.config = config;
|
|
4373
|
-
this.id = `anthropic:${targetName}`;
|
|
4374
|
-
this.targetName = targetName;
|
|
4375
|
-
this.defaults = {
|
|
4376
|
-
temperature: config.temperature,
|
|
4377
|
-
maxOutputTokens: config.maxOutputTokens,
|
|
4378
|
-
thinkingBudget: config.thinkingBudget
|
|
4379
|
-
};
|
|
4380
|
-
this.retryConfig = config.retry;
|
|
4381
|
-
const anthropic = createAnthropic({
|
|
4382
|
-
apiKey: config.apiKey
|
|
4383
|
-
});
|
|
4384
|
-
this.model = anthropic(config.model);
|
|
4385
|
-
}
|
|
4386
|
-
id;
|
|
4387
|
-
kind = "anthropic";
|
|
4388
|
-
targetName;
|
|
4389
|
-
model;
|
|
4390
|
-
defaults;
|
|
4391
|
-
retryConfig;
|
|
4392
|
-
async invoke(request) {
|
|
4393
|
-
const providerOptions = buildAnthropicProviderOptions(this.defaults);
|
|
4394
|
-
return invokeModel({
|
|
4395
|
-
model: this.model,
|
|
4396
|
-
request,
|
|
4397
|
-
defaults: this.defaults,
|
|
4398
|
-
retryConfig: this.retryConfig,
|
|
4399
|
-
providerOptions
|
|
4400
|
-
});
|
|
4401
|
-
}
|
|
4402
|
-
asLanguageModel() {
|
|
4403
|
-
return this.model;
|
|
4404
|
-
}
|
|
4405
|
-
};
|
|
4406
|
-
var GeminiProvider = class {
|
|
4407
|
-
constructor(targetName, config) {
|
|
4408
|
-
this.config = config;
|
|
4409
|
-
this.id = `gemini:${targetName}`;
|
|
4410
|
-
this.targetName = targetName;
|
|
4411
|
-
this.defaults = {
|
|
4412
|
-
temperature: config.temperature,
|
|
4413
|
-
maxOutputTokens: config.maxOutputTokens
|
|
4414
|
-
};
|
|
4415
|
-
this.retryConfig = config.retry;
|
|
4416
|
-
const google = createGoogleGenerativeAI({
|
|
4417
|
-
apiKey: config.apiKey
|
|
4418
|
-
});
|
|
4419
|
-
this.model = google(config.model);
|
|
4420
|
-
}
|
|
4421
|
-
id;
|
|
4422
|
-
kind = "gemini";
|
|
4423
|
-
targetName;
|
|
4424
|
-
model;
|
|
4425
|
-
defaults;
|
|
4426
|
-
retryConfig;
|
|
4427
|
-
async invoke(request) {
|
|
4428
|
-
return invokeModel({
|
|
4429
|
-
model: this.model,
|
|
4430
|
-
request,
|
|
4431
|
-
defaults: this.defaults,
|
|
4432
|
-
retryConfig: this.retryConfig
|
|
4433
|
-
});
|
|
4434
|
-
}
|
|
4435
|
-
asLanguageModel() {
|
|
4436
|
-
return this.model;
|
|
4437
|
-
}
|
|
4438
|
-
};
|
|
4439
|
-
function buildAzureOptions(config) {
|
|
4440
|
-
const options = {
|
|
4441
|
-
apiKey: config.apiKey,
|
|
4442
|
-
apiVersion: config.version,
|
|
4443
|
-
// Chat completions still use deployment-scoped Azure URLs for compatibility
|
|
4444
|
-
// with existing deployments. Responses API should use the SDK's v1 path.
|
|
4445
|
-
useDeploymentBasedUrls: config.apiFormat !== "responses"
|
|
4446
|
-
};
|
|
4447
|
-
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
4448
|
-
if (baseURL) {
|
|
4449
|
-
options.baseURL = baseURL;
|
|
4450
|
-
} else {
|
|
4451
|
-
options.resourceName = config.resourceName;
|
|
4452
|
-
}
|
|
4453
|
-
return options;
|
|
4454
|
-
}
|
|
4455
|
-
function normalizeAzureBaseUrl(resourceName) {
|
|
4456
|
-
const trimmed = resourceName.trim();
|
|
4457
|
-
if (!/^https?:\/\//i.test(trimmed)) {
|
|
4458
|
-
return void 0;
|
|
4459
|
-
}
|
|
4460
|
-
const withoutSlash = trimmed.replace(/\/+$/, "");
|
|
4461
|
-
const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
|
|
4462
|
-
return normalized;
|
|
4463
|
-
}
|
|
4464
|
-
function buildAnthropicProviderOptions(defaults) {
|
|
4465
|
-
if (defaults.thinkingBudget === void 0) {
|
|
4466
|
-
return void 0;
|
|
4467
|
-
}
|
|
4468
|
-
return {
|
|
4469
|
-
anthropic: {
|
|
4470
|
-
thinking: {
|
|
4471
|
-
type: "enabled",
|
|
4472
|
-
budgetTokens: defaults.thinkingBudget
|
|
4473
|
-
}
|
|
4474
|
-
}
|
|
4475
|
-
};
|
|
4476
|
-
}
|
|
4477
|
-
function buildChatPrompt(request) {
|
|
4478
|
-
const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
|
|
4479
|
-
if (provided) {
|
|
4480
|
-
const hasSystemMessage = provided.some((message) => message.role === "system");
|
|
4481
|
-
if (hasSystemMessage) {
|
|
4482
|
-
return provided;
|
|
4483
|
-
}
|
|
4484
|
-
const systemContent2 = resolveSystemContent(request);
|
|
4485
|
-
return [{ role: "system", content: systemContent2 }, ...provided];
|
|
4486
|
-
}
|
|
4487
|
-
const systemContent = resolveSystemContent(request);
|
|
4488
|
-
const userContent = request.question.trim();
|
|
4489
|
-
const prompt = [
|
|
4490
|
-
{ role: "system", content: systemContent },
|
|
4491
|
-
{ role: "user", content: userContent }
|
|
4492
|
-
];
|
|
4493
|
-
return prompt;
|
|
4494
|
-
}
|
|
4495
|
-
function resolveSystemContent(request) {
|
|
4496
|
-
const systemSegments = [];
|
|
4497
|
-
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
4498
|
-
systemSegments.push(request.systemPrompt.trim());
|
|
4499
|
-
} else {
|
|
4500
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
4501
|
-
}
|
|
4502
|
-
return systemSegments.join("\n\n");
|
|
4503
|
-
}
|
|
4504
|
-
function toModelMessages(chatPrompt) {
|
|
4505
|
-
return chatPrompt.map((message) => {
|
|
4506
|
-
if (message.role === "tool" || message.role === "function") {
|
|
4507
|
-
const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
|
|
4508
|
-
return {
|
|
4509
|
-
role: "assistant",
|
|
4510
|
-
content: `${prefix}${message.content}`
|
|
4511
|
-
};
|
|
4512
|
-
}
|
|
4513
|
-
if (message.role === "assistant" || message.role === "system" || message.role === "user") {
|
|
4514
|
-
return {
|
|
4515
|
-
role: message.role,
|
|
4516
|
-
content: message.content
|
|
4517
|
-
};
|
|
4518
|
-
}
|
|
4519
|
-
return {
|
|
4520
|
-
role: "user",
|
|
4521
|
-
content: message.content
|
|
4522
|
-
};
|
|
4523
|
-
});
|
|
4524
|
-
}
|
|
4525
|
-
function resolveModelSettings(request, defaults) {
|
|
4526
|
-
const temperature = request.temperature ?? defaults.temperature;
|
|
4527
|
-
const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
4528
|
-
return {
|
|
4529
|
-
temperature,
|
|
4530
|
-
maxOutputTokens
|
|
4531
|
-
};
|
|
4532
|
-
}
|
|
4533
|
-
async function invokeModel(options) {
|
|
4534
|
-
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
4535
|
-
const chatPrompt = buildChatPrompt(request);
|
|
4536
|
-
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
4537
|
-
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
4538
|
-
const startMs = Date.now();
|
|
4539
|
-
const result = await withRetry(
|
|
4540
|
-
() => generateText3({
|
|
4541
|
-
model,
|
|
4542
|
-
messages: toModelMessages(chatPrompt),
|
|
4543
|
-
temperature,
|
|
4544
|
-
maxOutputTokens,
|
|
4545
|
-
maxRetries: 0,
|
|
4546
|
-
abortSignal: request.signal,
|
|
4547
|
-
...providerOptions ? { providerOptions } : {}
|
|
4548
|
-
}),
|
|
4549
|
-
retryConfig,
|
|
4550
|
-
request.signal
|
|
4551
|
-
);
|
|
4552
|
-
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
4553
|
-
const durationMs = Date.now() - startMs;
|
|
4554
|
-
return mapResponse(result, { durationMs, startTime, endTime });
|
|
4555
|
-
}
|
|
4556
|
-
function mapResponse(result, timing) {
|
|
4557
|
-
const content = result.text ?? "";
|
|
4558
|
-
const rawUsage = result.totalUsage ?? result.usage;
|
|
4559
|
-
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
4560
|
-
const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
|
|
4561
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
|
|
4562
|
-
input: rawUsage.inputTokens,
|
|
4563
|
-
output: rawUsage.outputTokens,
|
|
4564
|
-
...reasoning != null ? { reasoning } : {},
|
|
4565
|
-
...cached != null ? { cached } : {}
|
|
4566
|
-
} : void 0;
|
|
4567
|
-
return {
|
|
4568
|
-
raw: result,
|
|
4569
|
-
usage: toJsonObject(rawUsage),
|
|
4570
|
-
output: [{ role: "assistant", content }],
|
|
4571
|
-
tokenUsage,
|
|
4572
|
-
durationMs: timing?.durationMs,
|
|
4573
|
-
startTime: timing?.startTime,
|
|
4574
|
-
endTime: timing?.endTime
|
|
4575
|
-
};
|
|
4576
|
-
}
|
|
4577
|
-
function toJsonObject(value) {
|
|
4578
|
-
if (!value || typeof value !== "object") {
|
|
4579
|
-
return void 0;
|
|
4580
|
-
}
|
|
4581
|
-
try {
|
|
4582
|
-
return JSON.parse(JSON.stringify(value));
|
|
4583
|
-
} catch {
|
|
4584
|
-
return void 0;
|
|
4585
|
-
}
|
|
4586
|
-
}
|
|
4587
|
-
function extractStatus(error) {
|
|
4588
|
-
if (!error || typeof error !== "object") {
|
|
4589
|
-
return void 0;
|
|
4590
|
-
}
|
|
4591
|
-
const candidate = error;
|
|
4592
|
-
const directStatus = candidate.status ?? candidate.statusCode;
|
|
4593
|
-
if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
|
|
4594
|
-
return directStatus;
|
|
4595
|
-
}
|
|
4596
|
-
const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
|
|
4597
|
-
if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
|
|
4598
|
-
return responseStatus;
|
|
4599
|
-
}
|
|
4600
|
-
const message = typeof candidate.message === "string" ? candidate.message : void 0;
|
|
4601
|
-
if (message) {
|
|
4602
|
-
const match = message.match(/HTTP\s+(\d{3})/i);
|
|
4603
|
-
if (match) {
|
|
4604
|
-
const parsed = Number.parseInt(match[1], 10);
|
|
4605
|
-
if (Number.isFinite(parsed)) {
|
|
4606
|
-
return parsed;
|
|
4607
|
-
}
|
|
4608
|
-
}
|
|
4609
|
-
}
|
|
4610
|
-
return void 0;
|
|
4611
|
-
}
|
|
4612
|
-
function isNetworkError(error) {
|
|
4613
|
-
if (!error || typeof error !== "object") {
|
|
4614
|
-
return false;
|
|
4615
|
-
}
|
|
4616
|
-
const candidate = error;
|
|
4617
|
-
if (candidate.name === "AbortError") {
|
|
4618
|
-
return false;
|
|
4619
|
-
}
|
|
4620
|
-
const code = candidate.code;
|
|
4621
|
-
if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
|
|
4622
|
-
return true;
|
|
4623
|
-
}
|
|
4624
|
-
const message = typeof candidate.message === "string" ? candidate.message : void 0;
|
|
4625
|
-
if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
|
|
4626
|
-
return true;
|
|
4627
|
-
}
|
|
4628
|
-
return false;
|
|
4629
|
-
}
|
|
4630
|
-
function isRetryableError(error, retryableStatusCodes) {
|
|
4631
|
-
const status = extractStatus(error);
|
|
4632
|
-
if (status === 401 || status === 403) {
|
|
4633
|
-
return false;
|
|
4634
|
-
}
|
|
4635
|
-
if (typeof status === "number") {
|
|
4636
|
-
return retryableStatusCodes.includes(status);
|
|
4637
|
-
}
|
|
4638
|
-
return isNetworkError(error);
|
|
4639
|
-
}
|
|
4640
|
-
function calculateRetryDelay(attempt, config) {
|
|
4641
|
-
const delay = Math.min(
|
|
4642
|
-
config.maxDelayMs,
|
|
4643
|
-
config.initialDelayMs * config.backoffFactor ** attempt
|
|
4644
|
-
);
|
|
4645
|
-
return delay * (0.75 + Math.random() * 0.5);
|
|
4646
|
-
}
|
|
4647
|
-
async function sleep(ms) {
|
|
4648
|
-
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
4649
|
-
}
|
|
4650
|
-
async function withRetry(fn, retryConfig, signal) {
|
|
4651
|
-
const config = {
|
|
4652
|
-
maxRetries: retryConfig?.maxRetries ?? 3,
|
|
4653
|
-
initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
|
|
4654
|
-
maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
|
|
4655
|
-
backoffFactor: retryConfig?.backoffFactor ?? 2,
|
|
4656
|
-
retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
|
|
4657
|
-
};
|
|
4658
|
-
let lastError;
|
|
4659
|
-
for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
|
|
4660
|
-
if (signal?.aborted) {
|
|
4661
|
-
throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
|
|
4662
|
-
}
|
|
4663
|
-
try {
|
|
4664
|
-
return await fn();
|
|
4665
|
-
} catch (error) {
|
|
4666
|
-
lastError = error;
|
|
4667
|
-
if (attempt >= config.maxRetries) {
|
|
4668
|
-
break;
|
|
4669
|
-
}
|
|
4670
|
-
if (!isRetryableError(error, config.retryableStatusCodes)) {
|
|
4671
|
-
throw error;
|
|
4672
|
-
}
|
|
4673
|
-
const delay = calculateRetryDelay(attempt, config);
|
|
4674
|
-
await sleep(delay);
|
|
4675
|
-
}
|
|
4676
|
-
}
|
|
4677
|
-
throw lastError;
|
|
4678
|
-
}
|
|
4679
|
-
|
|
4680
4258
|
// src/evaluation/providers/claude-cli.ts
|
|
4681
4259
|
import { spawn } from "node:child_process";
|
|
4682
4260
|
import { randomUUID } from "node:crypto";
|
|
@@ -9054,10 +8632,10 @@ function extractToolCallsFromEvents(events) {
|
|
|
9054
8632
|
}
|
|
9055
8633
|
}
|
|
9056
8634
|
const toolCalls = [];
|
|
9057
|
-
for (const [id, { tool
|
|
8635
|
+
for (const [id, { tool, input }] of starts) {
|
|
9058
8636
|
toolCalls.push(
|
|
9059
8637
|
normalizeToolCall("pi-cli", {
|
|
9060
|
-
tool
|
|
8638
|
+
tool,
|
|
9061
8639
|
input,
|
|
9062
8640
|
id: id.startsWith("anon-") ? void 0 : id,
|
|
9063
8641
|
output: results.get(id)
|
|
@@ -10124,7 +9702,7 @@ import { readFile as readFile5 } from "node:fs/promises";
|
|
|
10124
9702
|
import path20 from "node:path";
|
|
10125
9703
|
|
|
10126
9704
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
10127
|
-
function
|
|
9705
|
+
function sleep(ms) {
|
|
10128
9706
|
return new Promise((resolve) => {
|
|
10129
9707
|
setTimeout(resolve, ms);
|
|
10130
9708
|
});
|
|
@@ -10147,7 +9725,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
10147
9725
|
}
|
|
10148
9726
|
return false;
|
|
10149
9727
|
}
|
|
10150
|
-
await
|
|
9728
|
+
await sleep(pollInterval);
|
|
10151
9729
|
}
|
|
10152
9730
|
} catch (error) {
|
|
10153
9731
|
if (error.code === "ENOENT") {
|
|
@@ -10173,7 +9751,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
10173
9751
|
}
|
|
10174
9752
|
return false;
|
|
10175
9753
|
}
|
|
10176
|
-
await
|
|
9754
|
+
await sleep(pollInterval);
|
|
10177
9755
|
}
|
|
10178
9756
|
}
|
|
10179
9757
|
return false;
|
|
@@ -10202,7 +9780,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
10202
9780
|
}
|
|
10203
9781
|
}
|
|
10204
9782
|
if (pending.size > 0) {
|
|
10205
|
-
await
|
|
9783
|
+
await sleep(pollInterval);
|
|
10206
9784
|
}
|
|
10207
9785
|
}
|
|
10208
9786
|
} catch (error) {
|
|
@@ -10230,7 +9808,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
10230
9808
|
}
|
|
10231
9809
|
return false;
|
|
10232
9810
|
}
|
|
10233
|
-
await
|
|
9811
|
+
await sleep(pollInterval);
|
|
10234
9812
|
}
|
|
10235
9813
|
}
|
|
10236
9814
|
}
|
|
@@ -10326,7 +9904,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
10326
9904
|
label: "open-workspace"
|
|
10327
9905
|
});
|
|
10328
9906
|
await raceSpawnError(workspaceChild);
|
|
10329
|
-
await
|
|
9907
|
+
await sleep(100);
|
|
10330
9908
|
const wakeupChatId = "wakeup";
|
|
10331
9909
|
const chatArgs = [
|
|
10332
9910
|
"-r",
|
|
@@ -10343,7 +9921,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
10343
9921
|
console.error(`warning: Workspace readiness timeout after ${timeout}s`);
|
|
10344
9922
|
return false;
|
|
10345
9923
|
}
|
|
10346
|
-
await
|
|
9924
|
+
await sleep(pollInterval * 1e3);
|
|
10347
9925
|
}
|
|
10348
9926
|
return true;
|
|
10349
9927
|
}
|
|
@@ -10371,7 +9949,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
|
|
|
10371
9949
|
`VS Code workspace '${path22.basename(subagentDir)}' failed to become ready within the timeout. Check that '${vscodeCmd}' can open workspaces.`
|
|
10372
9950
|
);
|
|
10373
9951
|
}
|
|
10374
|
-
await
|
|
9952
|
+
await sleep(500);
|
|
10375
9953
|
const child = spawnVsCode(vscodeCmd, chatArgs, { label: "send-chat" });
|
|
10376
9954
|
await raceSpawnError(child);
|
|
10377
9955
|
}
|
|
@@ -10395,7 +9973,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
|
|
|
10395
9973
|
`VS Code workspace '${path22.basename(subagentDir)}' failed to become ready within the timeout. Check that '${vscodeCmd}' can open workspaces.`
|
|
10396
9974
|
);
|
|
10397
9975
|
}
|
|
10398
|
-
await
|
|
9976
|
+
await sleep(500);
|
|
10399
9977
|
const child = spawnVsCode(vscodeCmd, chatArgs, { label: "send-batch-chat" });
|
|
10400
9978
|
await raceSpawnError(child);
|
|
10401
9979
|
}
|
|
@@ -16105,7 +15683,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
16105
15683
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
16106
15684
|
}
|
|
16107
15685
|
if (format === "typescript") {
|
|
16108
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
15686
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-E6MROJGR.js");
|
|
16109
15687
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
16110
15688
|
}
|
|
16111
15689
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -16140,7 +15718,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
16140
15718
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
16141
15719
|
}
|
|
16142
15720
|
if (format === "typescript") {
|
|
16143
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
15721
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-E6MROJGR.js");
|
|
16144
15722
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
16145
15723
|
return suite.tests;
|
|
16146
15724
|
}
|
|
@@ -16496,7 +16074,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
16496
16074
|
const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
|
|
16497
16075
|
const mode = explicitMode ?? (workspacePath ? "static" : void 0);
|
|
16498
16076
|
const docker = parseDockerWorkspaceConfig(obj.docker);
|
|
16499
|
-
|
|
16077
|
+
const env = parseWorkspaceEnvConfig(obj.env);
|
|
16078
|
+
if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
|
|
16500
16079
|
return void 0;
|
|
16501
16080
|
return {
|
|
16502
16081
|
...template !== void 0 && { template },
|
|
@@ -16505,7 +16084,19 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
16505
16084
|
...hooks !== void 0 && { hooks },
|
|
16506
16085
|
...mode !== void 0 && { mode },
|
|
16507
16086
|
...workspacePath !== void 0 && { path: workspacePath },
|
|
16508
|
-
...docker !== void 0 && { docker }
|
|
16087
|
+
...docker !== void 0 && { docker },
|
|
16088
|
+
...env !== void 0 && { env }
|
|
16089
|
+
};
|
|
16090
|
+
}
|
|
16091
|
+
function parseWorkspaceEnvConfig(raw) {
|
|
16092
|
+
if (!isJsonObject(raw)) return void 0;
|
|
16093
|
+
const obj = raw;
|
|
16094
|
+
const required_commands = Array.isArray(obj.required_commands) ? obj.required_commands.filter((c) => typeof c === "string") : void 0;
|
|
16095
|
+
const required_python_modules = Array.isArray(obj.required_python_modules) ? obj.required_python_modules.filter((m) => typeof m === "string") : void 0;
|
|
16096
|
+
if (!required_commands?.length && !required_python_modules?.length) return void 0;
|
|
16097
|
+
return {
|
|
16098
|
+
...required_commands?.length && { required_commands },
|
|
16099
|
+
...required_python_modules?.length && { required_python_modules }
|
|
16509
16100
|
};
|
|
16510
16101
|
}
|
|
16511
16102
|
function parseDockerWorkspaceConfig(raw) {
|
|
@@ -16865,7 +16456,7 @@ async function runEvaluation(options) {
|
|
|
16865
16456
|
if (!cliModel) {
|
|
16866
16457
|
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
16867
16458
|
}
|
|
16868
|
-
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-
|
|
16459
|
+
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-MUIGGIP3.js");
|
|
16869
16460
|
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
16870
16461
|
}
|
|
16871
16462
|
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
@@ -17196,6 +16787,19 @@ async function runEvaluation(options) {
|
|
|
17196
16787
|
await dockerSetup.pullImage();
|
|
17197
16788
|
setupLog("Docker image pull complete");
|
|
17198
16789
|
}
|
|
16790
|
+
if (suiteWorkspace?.env) {
|
|
16791
|
+
try {
|
|
16792
|
+
await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
|
|
16793
|
+
setupLog("preflight checks passed");
|
|
16794
|
+
} catch (error) {
|
|
16795
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16796
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
16797
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16798
|
+
});
|
|
16799
|
+
}
|
|
16800
|
+
throw new Error(message);
|
|
16801
|
+
}
|
|
16802
|
+
}
|
|
17199
16803
|
const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
|
|
17200
16804
|
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
|
|
17201
16805
|
if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
|
|
@@ -18220,7 +17824,7 @@ async function runEvalCase(options) {
|
|
|
18220
17824
|
lastError = error;
|
|
18221
17825
|
if (attempt + 1 < attemptBudget) {
|
|
18222
17826
|
const delayMs = retryBackoffMs(attempt);
|
|
18223
|
-
await
|
|
17827
|
+
await sleep2(delayMs, signal);
|
|
18224
17828
|
attempt += 1;
|
|
18225
17829
|
continue;
|
|
18226
17830
|
}
|
|
@@ -19425,7 +19029,7 @@ function extractErrorMessage(error) {
|
|
|
19425
19029
|
function retryBackoffMs(attempt) {
|
|
19426
19030
|
return Math.min(2 ** attempt * 1e3, 3e4);
|
|
19427
19031
|
}
|
|
19428
|
-
function
|
|
19032
|
+
function sleep2(ms, signal) {
|
|
19429
19033
|
if (signal?.aborted) return Promise.resolve();
|
|
19430
19034
|
return new Promise((resolve) => {
|
|
19431
19035
|
const timer = setTimeout(resolve, ms);
|
|
@@ -19466,6 +19070,38 @@ function computeWeightedMean(entries) {
|
|
|
19466
19070
|
}
|
|
19467
19071
|
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
19468
19072
|
}
|
|
19073
|
+
async function runPreflightChecks(env, cwd, log) {
|
|
19074
|
+
const execFileAsync4 = promisify7(execFile3);
|
|
19075
|
+
const missing = [];
|
|
19076
|
+
for (const cmd of env.required_commands ?? []) {
|
|
19077
|
+
log(`preflight: checking command "${cmd}"`);
|
|
19078
|
+
try {
|
|
19079
|
+
if (process.platform === "win32") {
|
|
19080
|
+
await execFileAsync4("where", [cmd], { cwd });
|
|
19081
|
+
} else {
|
|
19082
|
+
await execFileAsync4("sh", ["-c", `command -v ${cmd}`], { cwd });
|
|
19083
|
+
}
|
|
19084
|
+
} catch {
|
|
19085
|
+
missing.push(`command: ${cmd}`);
|
|
19086
|
+
}
|
|
19087
|
+
}
|
|
19088
|
+
for (const mod of env.required_python_modules ?? []) {
|
|
19089
|
+
log(`preflight: checking Python module "${mod}"`);
|
|
19090
|
+
try {
|
|
19091
|
+
await execFileAsync4("python3", ["-c", `import ${mod}`], { cwd });
|
|
19092
|
+
} catch {
|
|
19093
|
+
missing.push(`python module: ${mod}`);
|
|
19094
|
+
}
|
|
19095
|
+
}
|
|
19096
|
+
if (missing.length > 0) {
|
|
19097
|
+
throw new Error(
|
|
19098
|
+
`Preflight checks failed \u2014 missing dependencies:
|
|
19099
|
+
${missing.map((m) => ` \u2022 ${m}`).join("\n")}
|
|
19100
|
+
|
|
19101
|
+
Install the missing dependencies before running this eval.`
|
|
19102
|
+
);
|
|
19103
|
+
}
|
|
19104
|
+
}
|
|
19469
19105
|
|
|
19470
19106
|
// src/evaluation/providers/function-provider.ts
|
|
19471
19107
|
function createFunctionProvider(taskFn) {
|
|
@@ -19954,4 +19590,4 @@ export {
|
|
|
19954
19590
|
loadTestById,
|
|
19955
19591
|
loadEvalCaseById
|
|
19956
19592
|
};
|
|
19957
|
-
//# sourceMappingURL=chunk-
|
|
19593
|
+
//# sourceMappingURL=chunk-EVEZQXIS.js.map
|