@agentv/core 4.25.1 → 4.25.2-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-MUIGGIP3.js +7 -0
- package/dist/chunk-5XV3FAAD.js +616 -0
- package/dist/chunk-5XV3FAAD.js.map +1 -0
- package/dist/{chunk-6HLBKYE2.js → chunk-CALQDF2Y.js} +1 -1
- package/dist/chunk-CALQDF2Y.js.map +1 -0
- package/dist/{chunk-IXTJEXWN.js → chunk-F234XBWV.js} +185 -551
- package/dist/chunk-F234XBWV.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +589 -419
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +78 -8
- package/dist/index.d.ts +78 -8
- package/dist/index.js +7 -12
- package/dist/index.js.map +1 -1
- package/dist/ts-eval-loader-5JMF2N65.js +12 -0
- package/package.json +2 -7
- package/dist/agentv-provider-TXM4UEUT.js +0 -7
- package/dist/chunk-6HLBKYE2.js.map +0 -1
- package/dist/chunk-IXTJEXWN.js.map +0 -1
- package/dist/chunk-PRNXHNLF.js +0 -65
- package/dist/chunk-PRNXHNLF.js.map +0 -1
- package/dist/ts-eval-loader-4CFPGHGT.js +0 -12
- /package/dist/{agentv-provider-TXM4UEUT.js.map → agentv-provider-MUIGGIP3.js.map} +0 -0
- /package/dist/{ts-eval-loader-4CFPGHGT.js.map → ts-eval-loader-5JMF2N65.js.map} +0 -0
|
@@ -17,14 +17,19 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveDelegatedTargetDefinition,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-
|
|
20
|
+
} from "./chunk-CALQDF2Y.js";
|
|
21
21
|
import {
|
|
22
22
|
execFileWithStdin,
|
|
23
23
|
execShellWithStdin
|
|
24
24
|
} from "./chunk-3WGHC7LC.js";
|
|
25
25
|
import {
|
|
26
|
-
AgentvProvider
|
|
27
|
-
|
|
26
|
+
AgentvProvider,
|
|
27
|
+
AnthropicProvider,
|
|
28
|
+
AzureProvider,
|
|
29
|
+
GeminiProvider,
|
|
30
|
+
OpenAIProvider,
|
|
31
|
+
OpenRouterProvider
|
|
32
|
+
} from "./chunk-5XV3FAAD.js";
|
|
28
33
|
|
|
29
34
|
// src/evaluation/loaders/ts-eval-loader.ts
|
|
30
35
|
import path46 from "node:path";
|
|
@@ -730,6 +735,8 @@ var CodeGrader = class {
|
|
|
730
735
|
const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
|
|
731
736
|
try {
|
|
732
737
|
let stdout;
|
|
738
|
+
let exitCode = 0;
|
|
739
|
+
let execStderr = "";
|
|
733
740
|
if (context.dockerConfig) {
|
|
734
741
|
const { DockerWorkspaceProvider } = await import("./docker-workspace-RPPXBT27.js");
|
|
735
742
|
const dockerProvider = new DockerWorkspaceProvider(context.dockerConfig);
|
|
@@ -738,31 +745,40 @@ var CodeGrader = class {
|
|
|
738
745
|
stdin: inputPayload,
|
|
739
746
|
repoCheckouts: getRepoCheckoutTargets(context.evalCase.workspace?.repos)
|
|
740
747
|
});
|
|
741
|
-
|
|
742
|
-
const trimmedErr = result.stderr.trim();
|
|
743
|
-
throw new Error(
|
|
744
|
-
trimmedErr.length > 0 ? `Code evaluator exited with code ${result.exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${result.exitCode}`
|
|
745
|
-
);
|
|
746
|
-
}
|
|
748
|
+
exitCode = result.exitCode;
|
|
747
749
|
stdout = result.stdout.trim();
|
|
750
|
+
execStderr = result.stderr;
|
|
748
751
|
} else {
|
|
749
|
-
|
|
752
|
+
const result = await runScriptRaw(
|
|
750
753
|
this.command,
|
|
751
754
|
inputPayload,
|
|
752
755
|
this.agentTimeoutMs,
|
|
753
756
|
this.cwd,
|
|
754
757
|
env
|
|
755
758
|
);
|
|
759
|
+
exitCode = result.exitCode;
|
|
760
|
+
stdout = result.stdout.trim();
|
|
761
|
+
execStderr = result.stderr;
|
|
756
762
|
}
|
|
757
|
-
const
|
|
758
|
-
const
|
|
759
|
-
|
|
763
|
+
const looksLikeJson = stdout.startsWith("{") || stdout.startsWith("[");
|
|
764
|
+
const hasStderr = execStderr.trim().length > 0;
|
|
765
|
+
if (exitCode !== 0 && (looksLikeJson || hasStderr)) {
|
|
766
|
+
const trimmedErr = formatStderr(execStderr);
|
|
767
|
+
throw new Error(
|
|
768
|
+
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
769
|
+
);
|
|
770
|
+
}
|
|
771
|
+
const rawParsed = parseJsonSafe(stdout);
|
|
772
|
+
const parsed = rawParsed != null && typeof rawParsed === "object" && !Array.isArray(rawParsed) ? rawParsed : void 0;
|
|
773
|
+
const passed = exitCode === 0;
|
|
774
|
+
const score = parsed != null ? clampScore(typeof parsed.score === "number" ? parsed.score : 0) : passed ? 1 : 0;
|
|
775
|
+
const assertions = parsed != null && Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
760
776
|
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
761
777
|
).map((a) => ({
|
|
762
778
|
text: String(a.text),
|
|
763
779
|
passed: Boolean(a.passed),
|
|
764
780
|
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
765
|
-
})) : [];
|
|
781
|
+
})) : parsed == null ? [{ text: stdout.trim() || (passed ? "exit 0" : `exit ${exitCode}`), passed }] : [];
|
|
766
782
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
767
783
|
const proxyUsage = getProxyUsage?.();
|
|
768
784
|
const graderRawRequest = {
|
|
@@ -820,8 +836,17 @@ var CodeGrader = class {
|
|
|
820
836
|
}
|
|
821
837
|
}
|
|
822
838
|
};
|
|
839
|
+
async function runScriptRaw(scriptPath, input, agentTimeoutMs, cwd, env) {
|
|
840
|
+
return typeof scriptPath === "string" ? execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
|
|
841
|
+
}
|
|
823
842
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
|
|
824
|
-
const { stdout, stderr, exitCode } =
|
|
843
|
+
const { stdout, stderr, exitCode } = await runScriptRaw(
|
|
844
|
+
scriptPath,
|
|
845
|
+
input,
|
|
846
|
+
agentTimeoutMs,
|
|
847
|
+
cwd,
|
|
848
|
+
env
|
|
849
|
+
);
|
|
825
850
|
if (exitCode !== 0) {
|
|
826
851
|
const trimmedErr = formatStderr(stderr);
|
|
827
852
|
throw new Error(
|
|
@@ -841,13 +866,9 @@ function formatStderr(stderr) {
|
|
|
841
866
|
${tail}`;
|
|
842
867
|
}
|
|
843
868
|
|
|
844
|
-
// src/evaluation/graders/composite.ts
|
|
845
|
-
import { generateText as generateText2 } from "ai";
|
|
846
|
-
|
|
847
869
|
// src/evaluation/graders/llm-grader.ts
|
|
848
870
|
import fs from "node:fs/promises";
|
|
849
871
|
import path3 from "node:path";
|
|
850
|
-
import { generateText, stepCountIs, tool } from "ai";
|
|
851
872
|
import { z } from "zod";
|
|
852
873
|
|
|
853
874
|
// src/evaluation/content-preprocessor.ts
|
|
@@ -1357,18 +1378,15 @@ ${context.toolCalls}`;
|
|
|
1357
1378
|
}
|
|
1358
1379
|
}
|
|
1359
1380
|
// ---------------------------------------------------------------------------
|
|
1360
|
-
// Built-in agent mode (agentv provider —
|
|
1381
|
+
// Built-in agent mode (agentv provider — provider.invoke() with filesystem tools)
|
|
1361
1382
|
// ---------------------------------------------------------------------------
|
|
1362
1383
|
/**
|
|
1363
|
-
* Built-in mode:
|
|
1384
|
+
* Built-in mode: drives the grader through provider.invoke() with the
|
|
1385
|
+
* sandboxed filesystem tools and a step budget. The pi-ai-backed agentv
|
|
1386
|
+
* provider runs the agent loop (tool call → tool execute → next model
|
|
1387
|
+
* turn) until the model stops requesting tools or maxSteps is hit.
|
|
1364
1388
|
*/
|
|
1365
1389
|
async evaluateBuiltIn(context, graderProvider) {
|
|
1366
|
-
const model = graderProvider.asLanguageModel?.();
|
|
1367
|
-
if (!model) {
|
|
1368
|
-
throw new Error(
|
|
1369
|
-
`Grader provider '${graderProvider.targetName}' does not support asLanguageModel() \u2014 required for built-in agent mode`
|
|
1370
|
-
);
|
|
1371
|
-
}
|
|
1372
1390
|
const workspacePath = context.workspacePath;
|
|
1373
1391
|
if (!workspacePath) {
|
|
1374
1392
|
throw new Error(
|
|
@@ -1387,18 +1405,21 @@ ${context.toolCalls}`;
|
|
|
1387
1405
|
maxSteps: this.maxSteps
|
|
1388
1406
|
};
|
|
1389
1407
|
try {
|
|
1390
|
-
const
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1408
|
+
const response = await graderProvider.invoke({
|
|
1409
|
+
question: userPrompt,
|
|
1410
|
+
systemPrompt,
|
|
1411
|
+
evalCaseId: context.evalCase.id,
|
|
1412
|
+
attempt: context.attempt,
|
|
1413
|
+
temperature: this.temperature ?? 0,
|
|
1394
1414
|
tools: fsTools,
|
|
1395
|
-
|
|
1396
|
-
temperature: this.temperature ?? 0
|
|
1415
|
+
maxSteps: this.maxSteps
|
|
1397
1416
|
});
|
|
1398
|
-
const
|
|
1417
|
+
const text = extractLastAssistantContent(response.output);
|
|
1418
|
+
const stepCount = response.steps?.count ?? 1;
|
|
1419
|
+
const toolCallCount = response.steps?.toolCallCount ?? 0;
|
|
1399
1420
|
const details = {
|
|
1400
1421
|
mode: "built-in",
|
|
1401
|
-
steps:
|
|
1422
|
+
steps: stepCount,
|
|
1402
1423
|
tool_calls: toolCallCount
|
|
1403
1424
|
};
|
|
1404
1425
|
return this.parseAgentResult(
|
|
@@ -1850,43 +1871,14 @@ ${outputSchema}`;
|
|
|
1850
1871
|
}
|
|
1851
1872
|
async generateStructuredResponse(options) {
|
|
1852
1873
|
const { context, graderProvider, systemPrompt, userPrompt, images } = options;
|
|
1853
|
-
const model = graderProvider.asLanguageModel?.();
|
|
1854
|
-
if (model) {
|
|
1855
|
-
const modelOptions = {
|
|
1856
|
-
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
1857
|
-
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
1858
|
-
};
|
|
1859
|
-
const hasImages = images && images.length > 0;
|
|
1860
|
-
const result = hasImages ? await generateText({
|
|
1861
|
-
model,
|
|
1862
|
-
system: systemPrompt,
|
|
1863
|
-
messages: [
|
|
1864
|
-
{
|
|
1865
|
-
role: "user",
|
|
1866
|
-
content: [
|
|
1867
|
-
{ type: "text", text: userPrompt },
|
|
1868
|
-
...toAiSdkImageParts(images)
|
|
1869
|
-
]
|
|
1870
|
-
}
|
|
1871
|
-
],
|
|
1872
|
-
...modelOptions
|
|
1873
|
-
}) : await generateText({
|
|
1874
|
-
model,
|
|
1875
|
-
system: systemPrompt,
|
|
1876
|
-
prompt: userPrompt,
|
|
1877
|
-
...modelOptions
|
|
1878
|
-
});
|
|
1879
|
-
const rawUsage = result.usage;
|
|
1880
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
1881
|
-
return { text: result.text, tokenUsage };
|
|
1882
|
-
}
|
|
1883
1874
|
const response = await graderProvider.invoke({
|
|
1884
1875
|
question: userPrompt,
|
|
1885
1876
|
systemPrompt,
|
|
1886
1877
|
evalCaseId: context.evalCase.id,
|
|
1887
1878
|
attempt: context.attempt,
|
|
1888
1879
|
maxOutputTokens: this.maxOutputTokens,
|
|
1889
|
-
temperature: this.temperature
|
|
1880
|
+
temperature: this.temperature,
|
|
1881
|
+
...images && images.length > 0 ? { images } : {}
|
|
1890
1882
|
});
|
|
1891
1883
|
return {
|
|
1892
1884
|
text: extractLastAssistantContent(response.output),
|
|
@@ -2083,13 +2075,6 @@ function extractImageBlocks(messages) {
|
|
|
2083
2075
|
}
|
|
2084
2076
|
return images;
|
|
2085
2077
|
}
|
|
2086
|
-
function toAiSdkImageParts(images) {
|
|
2087
|
-
return images.map((img) => ({
|
|
2088
|
-
type: "image",
|
|
2089
|
-
image: img.source,
|
|
2090
|
-
mediaType: img.media_type || void 0
|
|
2091
|
-
}));
|
|
2092
|
-
}
|
|
2093
2078
|
function resolveSandboxed(basePath, relativePath) {
|
|
2094
2079
|
const resolved = path3.resolve(basePath, relativePath);
|
|
2095
2080
|
if (!resolved.startsWith(basePath + path3.sep) && resolved !== basePath) {
|
|
@@ -2098,15 +2083,24 @@ function resolveSandboxed(basePath, relativePath) {
|
|
|
2098
2083
|
return resolved;
|
|
2099
2084
|
}
|
|
2100
2085
|
function createFilesystemTools(workspacePath) {
|
|
2101
|
-
return
|
|
2102
|
-
|
|
2086
|
+
return [
|
|
2087
|
+
{
|
|
2088
|
+
name: "list_files",
|
|
2103
2089
|
description: "List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).",
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2090
|
+
parameters: {
|
|
2091
|
+
type: "object",
|
|
2092
|
+
properties: {
|
|
2093
|
+
path: {
|
|
2094
|
+
type: "string",
|
|
2095
|
+
description: 'Relative path within workspace (use "." for root)',
|
|
2096
|
+
default: "."
|
|
2097
|
+
}
|
|
2098
|
+
}
|
|
2099
|
+
},
|
|
2107
2100
|
execute: async (input) => {
|
|
2101
|
+
const args = input ?? {};
|
|
2108
2102
|
try {
|
|
2109
|
-
const resolved = resolveSandboxed(workspacePath,
|
|
2103
|
+
const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
|
|
2110
2104
|
const entries = await fs.readdir(resolved, { withFileTypes: true });
|
|
2111
2105
|
return entries.map((e) => ({
|
|
2112
2106
|
name: e.name,
|
|
@@ -2116,18 +2110,25 @@ function createFilesystemTools(workspacePath) {
|
|
|
2116
2110
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
2117
2111
|
}
|
|
2118
2112
|
}
|
|
2119
|
-
}
|
|
2120
|
-
|
|
2113
|
+
},
|
|
2114
|
+
{
|
|
2115
|
+
name: "read_file",
|
|
2121
2116
|
description: "Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.",
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2117
|
+
parameters: {
|
|
2118
|
+
type: "object",
|
|
2119
|
+
properties: {
|
|
2120
|
+
path: { type: "string", description: "Relative path to file within workspace" }
|
|
2121
|
+
},
|
|
2122
|
+
required: ["path"]
|
|
2123
|
+
},
|
|
2125
2124
|
execute: async (input) => {
|
|
2125
|
+
const args = input ?? {};
|
|
2126
|
+
const relPath = args.path ?? "";
|
|
2126
2127
|
try {
|
|
2127
|
-
const resolved = resolveSandboxed(workspacePath,
|
|
2128
|
+
const resolved = resolveSandboxed(workspacePath, relPath);
|
|
2128
2129
|
const stat10 = await fs.stat(resolved);
|
|
2129
2130
|
if (stat10.isDirectory()) {
|
|
2130
|
-
return { error: `'${
|
|
2131
|
+
return { error: `'${relPath}' is a directory, not a file` };
|
|
2131
2132
|
}
|
|
2132
2133
|
const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
|
|
2133
2134
|
const fd = await fs.open(resolved, "r");
|
|
@@ -2143,19 +2144,29 @@ function createFilesystemTools(workspacePath) {
|
|
|
2143
2144
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
2144
2145
|
}
|
|
2145
2146
|
}
|
|
2146
|
-
}
|
|
2147
|
-
|
|
2147
|
+
},
|
|
2148
|
+
{
|
|
2149
|
+
name: "search_files",
|
|
2148
2150
|
description: "Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.",
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2151
|
+
parameters: {
|
|
2152
|
+
type: "object",
|
|
2153
|
+
properties: {
|
|
2154
|
+
pattern: { type: "string", description: "Regex pattern to search for" },
|
|
2155
|
+
path: {
|
|
2156
|
+
type: "string",
|
|
2157
|
+
description: 'Relative path to search within (use "." for root)',
|
|
2158
|
+
default: "."
|
|
2159
|
+
}
|
|
2160
|
+
},
|
|
2161
|
+
required: ["pattern"]
|
|
2162
|
+
},
|
|
2153
2163
|
execute: async (input) => {
|
|
2164
|
+
const args = input ?? {};
|
|
2154
2165
|
try {
|
|
2155
|
-
const resolved = resolveSandboxed(workspacePath,
|
|
2166
|
+
const resolved = resolveSandboxed(workspacePath, args.path ?? ".");
|
|
2156
2167
|
let regex;
|
|
2157
2168
|
try {
|
|
2158
|
-
regex = new RegExp(
|
|
2169
|
+
regex = new RegExp(args.pattern ?? "", "gi");
|
|
2159
2170
|
} catch (regexErr) {
|
|
2160
2171
|
return {
|
|
2161
2172
|
error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`
|
|
@@ -2168,8 +2179,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
2168
2179
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
2169
2180
|
}
|
|
2170
2181
|
}
|
|
2171
|
-
}
|
|
2172
|
-
|
|
2182
|
+
}
|
|
2183
|
+
];
|
|
2173
2184
|
}
|
|
2174
2185
|
async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
2175
2186
|
if (matches.length >= MAX_SEARCH_MATCHES) return;
|
|
@@ -2449,25 +2460,6 @@ var CompositeGrader = class {
|
|
|
2449
2460
|
target: graderProvider.targetName
|
|
2450
2461
|
};
|
|
2451
2462
|
try {
|
|
2452
|
-
const model = graderProvider.asLanguageModel?.();
|
|
2453
|
-
if (model) {
|
|
2454
|
-
const { text } = await generateText2({
|
|
2455
|
-
model,
|
|
2456
|
-
system: systemPrompt,
|
|
2457
|
-
prompt: userPrompt
|
|
2458
|
-
});
|
|
2459
|
-
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
2460
|
-
const score2 = clampScore(data2.score);
|
|
2461
|
-
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
2462
|
-
return {
|
|
2463
|
-
score: score2,
|
|
2464
|
-
verdict: scoreToVerdict(score2),
|
|
2465
|
-
assertions: assertions2,
|
|
2466
|
-
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
2467
|
-
graderRawRequest,
|
|
2468
|
-
scores
|
|
2469
|
-
};
|
|
2470
|
-
}
|
|
2471
2463
|
const response = await graderProvider.invoke({
|
|
2472
2464
|
question: userPrompt,
|
|
2473
2465
|
systemPrompt,
|
|
@@ -2625,7 +2617,7 @@ var DEFAULT_EXPLORATION_TOOLS = [
|
|
|
2625
2617
|
function explorationRatio(summary, explorationTools = DEFAULT_EXPLORATION_TOOLS) {
|
|
2626
2618
|
if (summary.eventCount === 0) return void 0;
|
|
2627
2619
|
const explorationCalls = explorationTools.reduce(
|
|
2628
|
-
(sum,
|
|
2620
|
+
(sum, tool) => sum + (summary.toolCalls[tool] ?? 0),
|
|
2629
2621
|
0
|
|
2630
2622
|
);
|
|
2631
2623
|
return explorationCalls / summary.eventCount;
|
|
@@ -4261,422 +4253,6 @@ function runEqualsAssertion(output, value) {
|
|
|
4261
4253
|
};
|
|
4262
4254
|
}
|
|
4263
4255
|
|
|
4264
|
-
// src/evaluation/providers/ai-sdk.ts
|
|
4265
|
-
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
4266
|
-
import { createAzure } from "@ai-sdk/azure";
|
|
4267
|
-
import { createGoogleGenerativeAI } from "@ai-sdk/google";
|
|
4268
|
-
import { createOpenAI } from "@ai-sdk/openai";
|
|
4269
|
-
import { createOpenRouter } from "@openrouter/ai-sdk-provider";
|
|
4270
|
-
import { generateText as generateText3 } from "ai";
|
|
4271
|
-
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
4272
|
-
var OpenAIProvider = class {
|
|
4273
|
-
constructor(targetName, config) {
|
|
4274
|
-
this.config = config;
|
|
4275
|
-
this.id = `openai:${targetName}`;
|
|
4276
|
-
this.targetName = targetName;
|
|
4277
|
-
this.defaults = {
|
|
4278
|
-
temperature: config.temperature,
|
|
4279
|
-
maxOutputTokens: config.maxOutputTokens
|
|
4280
|
-
};
|
|
4281
|
-
this.retryConfig = config.retry;
|
|
4282
|
-
const openai = createOpenAI({
|
|
4283
|
-
apiKey: config.apiKey,
|
|
4284
|
-
baseURL: config.baseURL
|
|
4285
|
-
});
|
|
4286
|
-
this.model = config.apiFormat === "responses" ? openai(config.model) : openai.chat(config.model);
|
|
4287
|
-
}
|
|
4288
|
-
id;
|
|
4289
|
-
kind = "openai";
|
|
4290
|
-
targetName;
|
|
4291
|
-
model;
|
|
4292
|
-
defaults;
|
|
4293
|
-
retryConfig;
|
|
4294
|
-
async invoke(request) {
|
|
4295
|
-
return invokeModel({
|
|
4296
|
-
model: this.model,
|
|
4297
|
-
request,
|
|
4298
|
-
defaults: this.defaults,
|
|
4299
|
-
retryConfig: this.retryConfig
|
|
4300
|
-
});
|
|
4301
|
-
}
|
|
4302
|
-
asLanguageModel() {
|
|
4303
|
-
return this.model;
|
|
4304
|
-
}
|
|
4305
|
-
};
|
|
4306
|
-
var AzureProvider = class {
|
|
4307
|
-
constructor(targetName, config) {
|
|
4308
|
-
this.config = config;
|
|
4309
|
-
this.id = `azure:${targetName}`;
|
|
4310
|
-
this.targetName = targetName;
|
|
4311
|
-
this.defaults = {
|
|
4312
|
-
temperature: config.temperature,
|
|
4313
|
-
maxOutputTokens: config.maxOutputTokens
|
|
4314
|
-
};
|
|
4315
|
-
this.retryConfig = config.retry;
|
|
4316
|
-
const azure = createAzure(buildAzureOptions(config));
|
|
4317
|
-
this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
|
|
4318
|
-
}
|
|
4319
|
-
id;
|
|
4320
|
-
kind = "azure";
|
|
4321
|
-
targetName;
|
|
4322
|
-
model;
|
|
4323
|
-
defaults;
|
|
4324
|
-
retryConfig;
|
|
4325
|
-
async invoke(request) {
|
|
4326
|
-
return invokeModel({
|
|
4327
|
-
model: this.model,
|
|
4328
|
-
request,
|
|
4329
|
-
defaults: this.defaults,
|
|
4330
|
-
retryConfig: this.retryConfig
|
|
4331
|
-
});
|
|
4332
|
-
}
|
|
4333
|
-
asLanguageModel() {
|
|
4334
|
-
return this.model;
|
|
4335
|
-
}
|
|
4336
|
-
};
|
|
4337
|
-
var OpenRouterProvider = class {
|
|
4338
|
-
constructor(targetName, config) {
|
|
4339
|
-
this.config = config;
|
|
4340
|
-
this.id = `openrouter:${targetName}`;
|
|
4341
|
-
this.targetName = targetName;
|
|
4342
|
-
this.defaults = {
|
|
4343
|
-
temperature: config.temperature,
|
|
4344
|
-
maxOutputTokens: config.maxOutputTokens
|
|
4345
|
-
};
|
|
4346
|
-
this.retryConfig = config.retry;
|
|
4347
|
-
const openrouter = createOpenRouter({
|
|
4348
|
-
apiKey: config.apiKey
|
|
4349
|
-
});
|
|
4350
|
-
this.model = openrouter(config.model);
|
|
4351
|
-
}
|
|
4352
|
-
id;
|
|
4353
|
-
kind = "openrouter";
|
|
4354
|
-
targetName;
|
|
4355
|
-
model;
|
|
4356
|
-
defaults;
|
|
4357
|
-
retryConfig;
|
|
4358
|
-
async invoke(request) {
|
|
4359
|
-
return invokeModel({
|
|
4360
|
-
model: this.model,
|
|
4361
|
-
request,
|
|
4362
|
-
defaults: this.defaults,
|
|
4363
|
-
retryConfig: this.retryConfig
|
|
4364
|
-
});
|
|
4365
|
-
}
|
|
4366
|
-
asLanguageModel() {
|
|
4367
|
-
return this.model;
|
|
4368
|
-
}
|
|
4369
|
-
};
|
|
4370
|
-
var AnthropicProvider = class {
|
|
4371
|
-
constructor(targetName, config) {
|
|
4372
|
-
this.config = config;
|
|
4373
|
-
this.id = `anthropic:${targetName}`;
|
|
4374
|
-
this.targetName = targetName;
|
|
4375
|
-
this.defaults = {
|
|
4376
|
-
temperature: config.temperature,
|
|
4377
|
-
maxOutputTokens: config.maxOutputTokens,
|
|
4378
|
-
thinkingBudget: config.thinkingBudget
|
|
4379
|
-
};
|
|
4380
|
-
this.retryConfig = config.retry;
|
|
4381
|
-
const anthropic = createAnthropic({
|
|
4382
|
-
apiKey: config.apiKey
|
|
4383
|
-
});
|
|
4384
|
-
this.model = anthropic(config.model);
|
|
4385
|
-
}
|
|
4386
|
-
id;
|
|
4387
|
-
kind = "anthropic";
|
|
4388
|
-
targetName;
|
|
4389
|
-
model;
|
|
4390
|
-
defaults;
|
|
4391
|
-
retryConfig;
|
|
4392
|
-
async invoke(request) {
|
|
4393
|
-
const providerOptions = buildAnthropicProviderOptions(this.defaults);
|
|
4394
|
-
return invokeModel({
|
|
4395
|
-
model: this.model,
|
|
4396
|
-
request,
|
|
4397
|
-
defaults: this.defaults,
|
|
4398
|
-
retryConfig: this.retryConfig,
|
|
4399
|
-
providerOptions
|
|
4400
|
-
});
|
|
4401
|
-
}
|
|
4402
|
-
asLanguageModel() {
|
|
4403
|
-
return this.model;
|
|
4404
|
-
}
|
|
4405
|
-
};
|
|
4406
|
-
var GeminiProvider = class {
|
|
4407
|
-
constructor(targetName, config) {
|
|
4408
|
-
this.config = config;
|
|
4409
|
-
this.id = `gemini:${targetName}`;
|
|
4410
|
-
this.targetName = targetName;
|
|
4411
|
-
this.defaults = {
|
|
4412
|
-
temperature: config.temperature,
|
|
4413
|
-
maxOutputTokens: config.maxOutputTokens
|
|
4414
|
-
};
|
|
4415
|
-
this.retryConfig = config.retry;
|
|
4416
|
-
const google = createGoogleGenerativeAI({
|
|
4417
|
-
apiKey: config.apiKey
|
|
4418
|
-
});
|
|
4419
|
-
this.model = google(config.model);
|
|
4420
|
-
}
|
|
4421
|
-
id;
|
|
4422
|
-
kind = "gemini";
|
|
4423
|
-
targetName;
|
|
4424
|
-
model;
|
|
4425
|
-
defaults;
|
|
4426
|
-
retryConfig;
|
|
4427
|
-
async invoke(request) {
|
|
4428
|
-
return invokeModel({
|
|
4429
|
-
model: this.model,
|
|
4430
|
-
request,
|
|
4431
|
-
defaults: this.defaults,
|
|
4432
|
-
retryConfig: this.retryConfig
|
|
4433
|
-
});
|
|
4434
|
-
}
|
|
4435
|
-
asLanguageModel() {
|
|
4436
|
-
return this.model;
|
|
4437
|
-
}
|
|
4438
|
-
};
|
|
4439
|
-
function buildAzureOptions(config) {
|
|
4440
|
-
const options = {
|
|
4441
|
-
apiKey: config.apiKey,
|
|
4442
|
-
apiVersion: config.version,
|
|
4443
|
-
// Chat completions still use deployment-scoped Azure URLs for compatibility
|
|
4444
|
-
// with existing deployments. Responses API should use the SDK's v1 path.
|
|
4445
|
-
useDeploymentBasedUrls: config.apiFormat !== "responses"
|
|
4446
|
-
};
|
|
4447
|
-
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
4448
|
-
if (baseURL) {
|
|
4449
|
-
options.baseURL = baseURL;
|
|
4450
|
-
} else {
|
|
4451
|
-
options.resourceName = config.resourceName;
|
|
4452
|
-
}
|
|
4453
|
-
return options;
|
|
4454
|
-
}
|
|
4455
|
-
function normalizeAzureBaseUrl(resourceName) {
|
|
4456
|
-
const trimmed = resourceName.trim();
|
|
4457
|
-
if (!/^https?:\/\//i.test(trimmed)) {
|
|
4458
|
-
return void 0;
|
|
4459
|
-
}
|
|
4460
|
-
const withoutSlash = trimmed.replace(/\/+$/, "");
|
|
4461
|
-
const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
|
|
4462
|
-
return normalized;
|
|
4463
|
-
}
|
|
4464
|
-
function buildAnthropicProviderOptions(defaults) {
|
|
4465
|
-
if (defaults.thinkingBudget === void 0) {
|
|
4466
|
-
return void 0;
|
|
4467
|
-
}
|
|
4468
|
-
return {
|
|
4469
|
-
anthropic: {
|
|
4470
|
-
thinking: {
|
|
4471
|
-
type: "enabled",
|
|
4472
|
-
budgetTokens: defaults.thinkingBudget
|
|
4473
|
-
}
|
|
4474
|
-
}
|
|
4475
|
-
};
|
|
4476
|
-
}
|
|
4477
|
-
function buildChatPrompt(request) {
|
|
4478
|
-
const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
|
|
4479
|
-
if (provided) {
|
|
4480
|
-
const hasSystemMessage = provided.some((message) => message.role === "system");
|
|
4481
|
-
if (hasSystemMessage) {
|
|
4482
|
-
return provided;
|
|
4483
|
-
}
|
|
4484
|
-
const systemContent2 = resolveSystemContent(request);
|
|
4485
|
-
return [{ role: "system", content: systemContent2 }, ...provided];
|
|
4486
|
-
}
|
|
4487
|
-
const systemContent = resolveSystemContent(request);
|
|
4488
|
-
const userContent = request.question.trim();
|
|
4489
|
-
const prompt = [
|
|
4490
|
-
{ role: "system", content: systemContent },
|
|
4491
|
-
{ role: "user", content: userContent }
|
|
4492
|
-
];
|
|
4493
|
-
return prompt;
|
|
4494
|
-
}
|
|
4495
|
-
function resolveSystemContent(request) {
|
|
4496
|
-
const systemSegments = [];
|
|
4497
|
-
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
4498
|
-
systemSegments.push(request.systemPrompt.trim());
|
|
4499
|
-
} else {
|
|
4500
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
4501
|
-
}
|
|
4502
|
-
return systemSegments.join("\n\n");
|
|
4503
|
-
}
|
|
4504
|
-
function toModelMessages(chatPrompt) {
|
|
4505
|
-
return chatPrompt.map((message) => {
|
|
4506
|
-
if (message.role === "tool" || message.role === "function") {
|
|
4507
|
-
const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
|
|
4508
|
-
return {
|
|
4509
|
-
role: "assistant",
|
|
4510
|
-
content: `${prefix}${message.content}`
|
|
4511
|
-
};
|
|
4512
|
-
}
|
|
4513
|
-
if (message.role === "assistant" || message.role === "system" || message.role === "user") {
|
|
4514
|
-
return {
|
|
4515
|
-
role: message.role,
|
|
4516
|
-
content: message.content
|
|
4517
|
-
};
|
|
4518
|
-
}
|
|
4519
|
-
return {
|
|
4520
|
-
role: "user",
|
|
4521
|
-
content: message.content
|
|
4522
|
-
};
|
|
4523
|
-
});
|
|
4524
|
-
}
|
|
4525
|
-
function resolveModelSettings(request, defaults) {
|
|
4526
|
-
const temperature = request.temperature ?? defaults.temperature;
|
|
4527
|
-
const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
4528
|
-
return {
|
|
4529
|
-
temperature,
|
|
4530
|
-
maxOutputTokens
|
|
4531
|
-
};
|
|
4532
|
-
}
|
|
4533
|
-
async function invokeModel(options) {
|
|
4534
|
-
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
4535
|
-
const chatPrompt = buildChatPrompt(request);
|
|
4536
|
-
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
4537
|
-
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
4538
|
-
const startMs = Date.now();
|
|
4539
|
-
const result = await withRetry(
|
|
4540
|
-
() => generateText3({
|
|
4541
|
-
model,
|
|
4542
|
-
messages: toModelMessages(chatPrompt),
|
|
4543
|
-
temperature,
|
|
4544
|
-
maxOutputTokens,
|
|
4545
|
-
maxRetries: 0,
|
|
4546
|
-
abortSignal: request.signal,
|
|
4547
|
-
...providerOptions ? { providerOptions } : {}
|
|
4548
|
-
}),
|
|
4549
|
-
retryConfig,
|
|
4550
|
-
request.signal
|
|
4551
|
-
);
|
|
4552
|
-
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
4553
|
-
const durationMs = Date.now() - startMs;
|
|
4554
|
-
return mapResponse(result, { durationMs, startTime, endTime });
|
|
4555
|
-
}
|
|
4556
|
-
function mapResponse(result, timing) {
|
|
4557
|
-
const content = result.text ?? "";
|
|
4558
|
-
const rawUsage = result.totalUsage ?? result.usage;
|
|
4559
|
-
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
4560
|
-
const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? void 0;
|
|
4561
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? {
|
|
4562
|
-
input: rawUsage.inputTokens,
|
|
4563
|
-
output: rawUsage.outputTokens,
|
|
4564
|
-
...reasoning != null ? { reasoning } : {},
|
|
4565
|
-
...cached != null ? { cached } : {}
|
|
4566
|
-
} : void 0;
|
|
4567
|
-
return {
|
|
4568
|
-
raw: result,
|
|
4569
|
-
usage: toJsonObject(rawUsage),
|
|
4570
|
-
output: [{ role: "assistant", content }],
|
|
4571
|
-
tokenUsage,
|
|
4572
|
-
durationMs: timing?.durationMs,
|
|
4573
|
-
startTime: timing?.startTime,
|
|
4574
|
-
endTime: timing?.endTime
|
|
4575
|
-
};
|
|
4576
|
-
}
|
|
4577
|
-
function toJsonObject(value) {
|
|
4578
|
-
if (!value || typeof value !== "object") {
|
|
4579
|
-
return void 0;
|
|
4580
|
-
}
|
|
4581
|
-
try {
|
|
4582
|
-
return JSON.parse(JSON.stringify(value));
|
|
4583
|
-
} catch {
|
|
4584
|
-
return void 0;
|
|
4585
|
-
}
|
|
4586
|
-
}
|
|
4587
|
-
function extractStatus(error) {
|
|
4588
|
-
if (!error || typeof error !== "object") {
|
|
4589
|
-
return void 0;
|
|
4590
|
-
}
|
|
4591
|
-
const candidate = error;
|
|
4592
|
-
const directStatus = candidate.status ?? candidate.statusCode;
|
|
4593
|
-
if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
|
|
4594
|
-
return directStatus;
|
|
4595
|
-
}
|
|
4596
|
-
const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
|
|
4597
|
-
if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
|
|
4598
|
-
return responseStatus;
|
|
4599
|
-
}
|
|
4600
|
-
const message = typeof candidate.message === "string" ? candidate.message : void 0;
|
|
4601
|
-
if (message) {
|
|
4602
|
-
const match = message.match(/HTTP\s+(\d{3})/i);
|
|
4603
|
-
if (match) {
|
|
4604
|
-
const parsed = Number.parseInt(match[1], 10);
|
|
4605
|
-
if (Number.isFinite(parsed)) {
|
|
4606
|
-
return parsed;
|
|
4607
|
-
}
|
|
4608
|
-
}
|
|
4609
|
-
}
|
|
4610
|
-
return void 0;
|
|
4611
|
-
}
|
|
4612
|
-
function isNetworkError(error) {
|
|
4613
|
-
if (!error || typeof error !== "object") {
|
|
4614
|
-
return false;
|
|
4615
|
-
}
|
|
4616
|
-
const candidate = error;
|
|
4617
|
-
if (candidate.name === "AbortError") {
|
|
4618
|
-
return false;
|
|
4619
|
-
}
|
|
4620
|
-
const code = candidate.code;
|
|
4621
|
-
if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
|
|
4622
|
-
return true;
|
|
4623
|
-
}
|
|
4624
|
-
const message = typeof candidate.message === "string" ? candidate.message : void 0;
|
|
4625
|
-
if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
|
|
4626
|
-
return true;
|
|
4627
|
-
}
|
|
4628
|
-
return false;
|
|
4629
|
-
}
|
|
4630
|
-
function isRetryableError(error, retryableStatusCodes) {
|
|
4631
|
-
const status = extractStatus(error);
|
|
4632
|
-
if (status === 401 || status === 403) {
|
|
4633
|
-
return false;
|
|
4634
|
-
}
|
|
4635
|
-
if (typeof status === "number") {
|
|
4636
|
-
return retryableStatusCodes.includes(status);
|
|
4637
|
-
}
|
|
4638
|
-
return isNetworkError(error);
|
|
4639
|
-
}
|
|
4640
|
-
function calculateRetryDelay(attempt, config) {
|
|
4641
|
-
const delay = Math.min(
|
|
4642
|
-
config.maxDelayMs,
|
|
4643
|
-
config.initialDelayMs * config.backoffFactor ** attempt
|
|
4644
|
-
);
|
|
4645
|
-
return delay * (0.75 + Math.random() * 0.5);
|
|
4646
|
-
}
|
|
4647
|
-
async function sleep(ms) {
|
|
4648
|
-
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
4649
|
-
}
|
|
4650
|
-
async function withRetry(fn, retryConfig, signal) {
|
|
4651
|
-
const config = {
|
|
4652
|
-
maxRetries: retryConfig?.maxRetries ?? 3,
|
|
4653
|
-
initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
|
|
4654
|
-
maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
|
|
4655
|
-
backoffFactor: retryConfig?.backoffFactor ?? 2,
|
|
4656
|
-
retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
|
|
4657
|
-
};
|
|
4658
|
-
let lastError;
|
|
4659
|
-
for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
|
|
4660
|
-
if (signal?.aborted) {
|
|
4661
|
-
throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
|
|
4662
|
-
}
|
|
4663
|
-
try {
|
|
4664
|
-
return await fn();
|
|
4665
|
-
} catch (error) {
|
|
4666
|
-
lastError = error;
|
|
4667
|
-
if (attempt >= config.maxRetries) {
|
|
4668
|
-
break;
|
|
4669
|
-
}
|
|
4670
|
-
if (!isRetryableError(error, config.retryableStatusCodes)) {
|
|
4671
|
-
throw error;
|
|
4672
|
-
}
|
|
4673
|
-
const delay = calculateRetryDelay(attempt, config);
|
|
4674
|
-
await sleep(delay);
|
|
4675
|
-
}
|
|
4676
|
-
}
|
|
4677
|
-
throw lastError;
|
|
4678
|
-
}
|
|
4679
|
-
|
|
4680
4256
|
// src/evaluation/providers/claude-cli.ts
|
|
4681
4257
|
import { spawn } from "node:child_process";
|
|
4682
4258
|
import { randomUUID } from "node:crypto";
|
|
@@ -9054,10 +8630,10 @@ function extractToolCallsFromEvents(events) {
|
|
|
9054
8630
|
}
|
|
9055
8631
|
}
|
|
9056
8632
|
const toolCalls = [];
|
|
9057
|
-
for (const [id, { tool
|
|
8633
|
+
for (const [id, { tool, input }] of starts) {
|
|
9058
8634
|
toolCalls.push(
|
|
9059
8635
|
normalizeToolCall("pi-cli", {
|
|
9060
|
-
tool
|
|
8636
|
+
tool,
|
|
9061
8637
|
input,
|
|
9062
8638
|
id: id.startsWith("anon-") ? void 0 : id,
|
|
9063
8639
|
output: results.get(id)
|
|
@@ -10124,7 +9700,7 @@ import { readFile as readFile5 } from "node:fs/promises";
|
|
|
10124
9700
|
import path20 from "node:path";
|
|
10125
9701
|
|
|
10126
9702
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
10127
|
-
function
|
|
9703
|
+
function sleep(ms) {
|
|
10128
9704
|
return new Promise((resolve) => {
|
|
10129
9705
|
setTimeout(resolve, ms);
|
|
10130
9706
|
});
|
|
@@ -10147,7 +9723,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
10147
9723
|
}
|
|
10148
9724
|
return false;
|
|
10149
9725
|
}
|
|
10150
|
-
await
|
|
9726
|
+
await sleep(pollInterval);
|
|
10151
9727
|
}
|
|
10152
9728
|
} catch (error) {
|
|
10153
9729
|
if (error.code === "ENOENT") {
|
|
@@ -10173,7 +9749,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
10173
9749
|
}
|
|
10174
9750
|
return false;
|
|
10175
9751
|
}
|
|
10176
|
-
await
|
|
9752
|
+
await sleep(pollInterval);
|
|
10177
9753
|
}
|
|
10178
9754
|
}
|
|
10179
9755
|
return false;
|
|
@@ -10202,7 +9778,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
10202
9778
|
}
|
|
10203
9779
|
}
|
|
10204
9780
|
if (pending.size > 0) {
|
|
10205
|
-
await
|
|
9781
|
+
await sleep(pollInterval);
|
|
10206
9782
|
}
|
|
10207
9783
|
}
|
|
10208
9784
|
} catch (error) {
|
|
@@ -10230,7 +9806,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
10230
9806
|
}
|
|
10231
9807
|
return false;
|
|
10232
9808
|
}
|
|
10233
|
-
await
|
|
9809
|
+
await sleep(pollInterval);
|
|
10234
9810
|
}
|
|
10235
9811
|
}
|
|
10236
9812
|
}
|
|
@@ -10326,7 +9902,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
10326
9902
|
label: "open-workspace"
|
|
10327
9903
|
});
|
|
10328
9904
|
await raceSpawnError(workspaceChild);
|
|
10329
|
-
await
|
|
9905
|
+
await sleep(100);
|
|
10330
9906
|
const wakeupChatId = "wakeup";
|
|
10331
9907
|
const chatArgs = [
|
|
10332
9908
|
"-r",
|
|
@@ -10343,7 +9919,7 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
|
|
|
10343
9919
|
console.error(`warning: Workspace readiness timeout after ${timeout}s`);
|
|
10344
9920
|
return false;
|
|
10345
9921
|
}
|
|
10346
|
-
await
|
|
9922
|
+
await sleep(pollInterval * 1e3);
|
|
10347
9923
|
}
|
|
10348
9924
|
return true;
|
|
10349
9925
|
}
|
|
@@ -10371,7 +9947,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
|
|
|
10371
9947
|
`VS Code workspace '${path22.basename(subagentDir)}' failed to become ready within the timeout. Check that '${vscodeCmd}' can open workspaces.`
|
|
10372
9948
|
);
|
|
10373
9949
|
}
|
|
10374
|
-
await
|
|
9950
|
+
await sleep(500);
|
|
10375
9951
|
const child = spawnVsCode(vscodeCmd, chatArgs, { label: "send-chat" });
|
|
10376
9952
|
await raceSpawnError(child);
|
|
10377
9953
|
}
|
|
@@ -10395,7 +9971,7 @@ async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, c
|
|
|
10395
9971
|
`VS Code workspace '${path22.basename(subagentDir)}' failed to become ready within the timeout. Check that '${vscodeCmd}' can open workspaces.`
|
|
10396
9972
|
);
|
|
10397
9973
|
}
|
|
10398
|
-
await
|
|
9974
|
+
await sleep(500);
|
|
10399
9975
|
const child = spawnVsCode(vscodeCmd, chatArgs, { label: "send-batch-chat" });
|
|
10400
9976
|
await raceSpawnError(child);
|
|
10401
9977
|
}
|
|
@@ -16105,7 +15681,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
16105
15681
|
return { tests: await loadTestsFromAgentSkills(evalFilePath) };
|
|
16106
15682
|
}
|
|
16107
15683
|
if (format === "typescript") {
|
|
16108
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
15684
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-5JMF2N65.js");
|
|
16109
15685
|
return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
16110
15686
|
}
|
|
16111
15687
|
const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
|
|
@@ -16140,7 +15716,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
|
|
|
16140
15716
|
return loadTestsFromAgentSkills(evalFilePath);
|
|
16141
15717
|
}
|
|
16142
15718
|
if (format === "typescript") {
|
|
16143
|
-
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-
|
|
15719
|
+
const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-5JMF2N65.js");
|
|
16144
15720
|
const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
|
|
16145
15721
|
return suite.tests;
|
|
16146
15722
|
}
|
|
@@ -16496,7 +16072,8 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
16496
16072
|
const workspacePath = typeof obj.path === "string" ? obj.path : void 0;
|
|
16497
16073
|
const mode = explicitMode ?? (workspacePath ? "static" : void 0);
|
|
16498
16074
|
const docker = parseDockerWorkspaceConfig(obj.docker);
|
|
16499
|
-
|
|
16075
|
+
const env = parseWorkspaceEnvConfig(obj.env);
|
|
16076
|
+
if (!template && !isolation && !repos && !hooks && !mode && !workspacePath && !docker && !env)
|
|
16500
16077
|
return void 0;
|
|
16501
16078
|
return {
|
|
16502
16079
|
...template !== void 0 && { template },
|
|
@@ -16505,7 +16082,19 @@ function parseWorkspaceConfig(raw, evalFileDir) {
|
|
|
16505
16082
|
...hooks !== void 0 && { hooks },
|
|
16506
16083
|
...mode !== void 0 && { mode },
|
|
16507
16084
|
...workspacePath !== void 0 && { path: workspacePath },
|
|
16508
|
-
...docker !== void 0 && { docker }
|
|
16085
|
+
...docker !== void 0 && { docker },
|
|
16086
|
+
...env !== void 0 && { env }
|
|
16087
|
+
};
|
|
16088
|
+
}
|
|
16089
|
+
function parseWorkspaceEnvConfig(raw) {
|
|
16090
|
+
if (!isJsonObject(raw)) return void 0;
|
|
16091
|
+
const obj = raw;
|
|
16092
|
+
const required_commands = Array.isArray(obj.required_commands) ? obj.required_commands.filter((c) => typeof c === "string") : void 0;
|
|
16093
|
+
const required_python_modules = Array.isArray(obj.required_python_modules) ? obj.required_python_modules.filter((m) => typeof m === "string") : void 0;
|
|
16094
|
+
if (!required_commands?.length && !required_python_modules?.length) return void 0;
|
|
16095
|
+
return {
|
|
16096
|
+
...required_commands?.length && { required_commands },
|
|
16097
|
+
...required_python_modules?.length && { required_python_modules }
|
|
16509
16098
|
};
|
|
16510
16099
|
}
|
|
16511
16100
|
function parseDockerWorkspaceConfig(raw) {
|
|
@@ -16865,7 +16454,7 @@ async function runEvaluation(options) {
|
|
|
16865
16454
|
if (!cliModel) {
|
|
16866
16455
|
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
16867
16456
|
}
|
|
16868
|
-
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-
|
|
16457
|
+
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-MUIGGIP3.js");
|
|
16869
16458
|
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
16870
16459
|
}
|
|
16871
16460
|
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
@@ -17196,6 +16785,19 @@ async function runEvaluation(options) {
|
|
|
17196
16785
|
await dockerSetup.pullImage();
|
|
17197
16786
|
setupLog("Docker image pull complete");
|
|
17198
16787
|
}
|
|
16788
|
+
if (suiteWorkspace?.env) {
|
|
16789
|
+
try {
|
|
16790
|
+
await runPreflightChecks(suiteWorkspace.env, sharedWorkspacePath ?? void 0, setupLog);
|
|
16791
|
+
setupLog("preflight checks passed");
|
|
16792
|
+
} catch (error) {
|
|
16793
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
16794
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
16795
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
16796
|
+
});
|
|
16797
|
+
}
|
|
16798
|
+
throw new Error(message);
|
|
16799
|
+
}
|
|
16800
|
+
}
|
|
17199
16801
|
const suiteHooksEnabled = hooksEnabled(suiteWorkspace);
|
|
17200
16802
|
const suiteBeforeAllHook = suiteWorkspace?.hooks?.before_all;
|
|
17201
16803
|
if (sharedWorkspacePath && suiteHooksEnabled && hasHookCommand(suiteBeforeAllHook)) {
|
|
@@ -18220,7 +17822,7 @@ async function runEvalCase(options) {
|
|
|
18220
17822
|
lastError = error;
|
|
18221
17823
|
if (attempt + 1 < attemptBudget) {
|
|
18222
17824
|
const delayMs = retryBackoffMs(attempt);
|
|
18223
|
-
await
|
|
17825
|
+
await sleep2(delayMs, signal);
|
|
18224
17826
|
attempt += 1;
|
|
18225
17827
|
continue;
|
|
18226
17828
|
}
|
|
@@ -19425,7 +19027,7 @@ function extractErrorMessage(error) {
|
|
|
19425
19027
|
function retryBackoffMs(attempt) {
|
|
19426
19028
|
return Math.min(2 ** attempt * 1e3, 3e4);
|
|
19427
19029
|
}
|
|
19428
|
-
function
|
|
19030
|
+
function sleep2(ms, signal) {
|
|
19429
19031
|
if (signal?.aborted) return Promise.resolve();
|
|
19430
19032
|
return new Promise((resolve) => {
|
|
19431
19033
|
const timer = setTimeout(resolve, ms);
|
|
@@ -19466,6 +19068,38 @@ function computeWeightedMean(entries) {
|
|
|
19466
19068
|
}
|
|
19467
19069
|
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
19468
19070
|
}
|
|
19071
|
+
async function runPreflightChecks(env, cwd, log) {
|
|
19072
|
+
const execFileAsync4 = promisify7(execFile3);
|
|
19073
|
+
const missing = [];
|
|
19074
|
+
for (const cmd of env.required_commands ?? []) {
|
|
19075
|
+
log(`preflight: checking command "${cmd}"`);
|
|
19076
|
+
try {
|
|
19077
|
+
if (process.platform === "win32") {
|
|
19078
|
+
await execFileAsync4("where", [cmd], { cwd });
|
|
19079
|
+
} else {
|
|
19080
|
+
await execFileAsync4("sh", ["-c", `command -v ${cmd}`], { cwd });
|
|
19081
|
+
}
|
|
19082
|
+
} catch {
|
|
19083
|
+
missing.push(`command: ${cmd}`);
|
|
19084
|
+
}
|
|
19085
|
+
}
|
|
19086
|
+
for (const mod of env.required_python_modules ?? []) {
|
|
19087
|
+
log(`preflight: checking Python module "${mod}"`);
|
|
19088
|
+
try {
|
|
19089
|
+
await execFileAsync4("python3", ["-c", `import ${mod}`], { cwd });
|
|
19090
|
+
} catch {
|
|
19091
|
+
missing.push(`python module: ${mod}`);
|
|
19092
|
+
}
|
|
19093
|
+
}
|
|
19094
|
+
if (missing.length > 0) {
|
|
19095
|
+
throw new Error(
|
|
19096
|
+
`Preflight checks failed \u2014 missing dependencies:
|
|
19097
|
+
${missing.map((m) => ` \u2022 ${m}`).join("\n")}
|
|
19098
|
+
|
|
19099
|
+
Install the missing dependencies before running this eval.`
|
|
19100
|
+
);
|
|
19101
|
+
}
|
|
19102
|
+
}
|
|
19469
19103
|
|
|
19470
19104
|
// src/evaluation/providers/function-provider.ts
|
|
19471
19105
|
function createFunctionProvider(taskFn) {
|
|
@@ -19954,4 +19588,4 @@ export {
|
|
|
19954
19588
|
loadTestById,
|
|
19955
19589
|
loadEvalCaseById
|
|
19956
19590
|
};
|
|
19957
|
-
//# sourceMappingURL=chunk-
|
|
19591
|
+
//# sourceMappingURL=chunk-F234XBWV.js.map
|