@wix/evalforge-evaluator 0.146.0 → 0.148.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +192 -62
- package/build/index.js.map +3 -3
- package/build/index.mjs +201 -66
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/types.d.ts +15 -3
- package/build/types/run-scenario/agents/opencode/config.d.ts +3 -0
- package/build/types/run-scenario/agents/opencode/types.d.ts +8 -0
- package/build/types/run-scenario/agents/simple-agent/execute.d.ts +1 -0
- package/build/types/run-scenario/agents/simple-agent/mcp-tools.d.ts +0 -2
- package/package.json +8 -7
package/build/index.mjs
CHANGED
|
@@ -315,10 +315,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
315
315
|
if (evalRun.agentId) {
|
|
316
316
|
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
317
317
|
}
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
318
|
+
const skills = [];
|
|
319
|
+
const mcps = [];
|
|
320
|
+
const subAgents = [];
|
|
321
|
+
const rules = [];
|
|
322
322
|
if (evalRun.capabilityIds && evalRun.capabilityIds.length > 0) {
|
|
323
323
|
const fetchResults = await Promise.allSettled(
|
|
324
324
|
evalRun.capabilityIds.map((id) => api.getCapability(projectId2, id))
|
|
@@ -605,7 +605,7 @@ function getAdapter(identifier) {
|
|
|
605
605
|
}
|
|
606
606
|
|
|
607
607
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
608
|
-
import { AgentRunCommand } from "@wix/evalforge-types";
|
|
608
|
+
import { AgentRunCommand, ClaudeCodeConfigSchema } from "@wix/evalforge-types";
|
|
609
609
|
|
|
610
610
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
611
611
|
import {
|
|
@@ -1187,10 +1187,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1187
1187
|
let lastAction = "Starting...";
|
|
1188
1188
|
let lastToolName;
|
|
1189
1189
|
let lastFilePath;
|
|
1190
|
-
const maxTurns = options.maxTurns
|
|
1190
|
+
const maxTurns = options.maxTurns || void 0;
|
|
1191
1191
|
let messageCount = 0;
|
|
1192
1192
|
const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
|
|
1193
|
-
const baseAllowedTools = [
|
|
1193
|
+
const baseAllowedTools = options.allowedTools ?? [
|
|
1194
1194
|
"Skill",
|
|
1195
1195
|
"Read",
|
|
1196
1196
|
"Write",
|
|
@@ -1204,13 +1204,17 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1204
1204
|
];
|
|
1205
1205
|
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
1206
1206
|
const queryOptions = {
|
|
1207
|
+
...options.extras ?? {},
|
|
1207
1208
|
env: sdkEnv,
|
|
1208
1209
|
cwd: options.cwd,
|
|
1209
1210
|
settingSources: ["project"],
|
|
1210
1211
|
allowedTools,
|
|
1212
|
+
...options.disallowedTools?.length ? { disallowedTools: options.disallowedTools } : {},
|
|
1211
1213
|
model: options.model || DEFAULT_MODEL,
|
|
1212
1214
|
maxTurns,
|
|
1213
1215
|
maxThinkingTokens: options.maxThinkingTokens,
|
|
1216
|
+
...options.effort ? { effort: options.effort } : {},
|
|
1217
|
+
...options.maxBudgetUsd != null ? { maxBudgetUsd: options.maxBudgetUsd } : {},
|
|
1214
1218
|
// Use 'default' permission mode with custom canUseTool handler
|
|
1215
1219
|
// instead of 'bypassPermissions' which fails on root
|
|
1216
1220
|
permissionMode: "default",
|
|
@@ -1285,7 +1289,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1285
1289
|
traceContext.authToken
|
|
1286
1290
|
);
|
|
1287
1291
|
}
|
|
1288
|
-
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
1292
|
+
const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
|
|
1289
1293
|
let timeoutHandle;
|
|
1290
1294
|
let timedOut = false;
|
|
1291
1295
|
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
@@ -2049,7 +2053,7 @@ var ClaudeCodeAdapter = class {
|
|
|
2049
2053
|
skills,
|
|
2050
2054
|
scenario,
|
|
2051
2055
|
cwd,
|
|
2052
|
-
|
|
2056
|
+
config,
|
|
2053
2057
|
aiGatewayUrl,
|
|
2054
2058
|
aiGatewayHeaders,
|
|
2055
2059
|
traceContext,
|
|
@@ -2058,20 +2062,37 @@ var ClaudeCodeAdapter = class {
|
|
|
2058
2062
|
rules,
|
|
2059
2063
|
systemPrompt
|
|
2060
2064
|
} = context;
|
|
2061
|
-
const
|
|
2065
|
+
const typed = config ? ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
2066
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
2067
|
+
const schemaKeys = new Set(Object.keys(ClaudeCodeConfigSchema.shape));
|
|
2068
|
+
const extras = {};
|
|
2069
|
+
if (config) {
|
|
2070
|
+
for (const [key, value] of Object.entries(config)) {
|
|
2071
|
+
if (!schemaKeys.has(key)) extras[key] = value;
|
|
2072
|
+
}
|
|
2073
|
+
}
|
|
2074
|
+
const rawMaxTurns = cfg?.maxTurns;
|
|
2075
|
+
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
2062
2076
|
const options = {
|
|
2063
2077
|
cwd,
|
|
2064
|
-
model:
|
|
2065
|
-
temperature:
|
|
2066
|
-
maxTokens:
|
|
2067
|
-
maxTurns
|
|
2078
|
+
model: cfg?.model,
|
|
2079
|
+
temperature: cfg?.temperature,
|
|
2080
|
+
maxTokens: cfg?.maxTokens,
|
|
2081
|
+
maxTurns,
|
|
2082
|
+
maxThinkingTokens: cfg?.maxThinkingTokens,
|
|
2083
|
+
allowedTools: cfg?.allowedTools,
|
|
2084
|
+
disallowedTools: cfg?.disallowedTools,
|
|
2085
|
+
effort: cfg?.effort,
|
|
2086
|
+
maxBudgetUsd: cfg?.maxBudgetUsd,
|
|
2087
|
+
maxDurationMs: cfg?.maxDurationMs,
|
|
2068
2088
|
aiGatewayUrl,
|
|
2069
2089
|
aiGatewayHeaders,
|
|
2070
2090
|
traceContext,
|
|
2071
2091
|
mcps,
|
|
2072
2092
|
subAgents,
|
|
2073
2093
|
rules,
|
|
2074
|
-
systemPrompt
|
|
2094
|
+
systemPrompt,
|
|
2095
|
+
extras
|
|
2075
2096
|
};
|
|
2076
2097
|
const { result, llmTrace, conversation } = await executeWithClaudeCode(
|
|
2077
2098
|
skills,
|
|
@@ -2098,7 +2119,7 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
2098
2119
|
defaultRegistry.register(claudeCodeAdapter);
|
|
2099
2120
|
|
|
2100
2121
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
2101
|
-
import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
|
|
2122
|
+
import { AgentRunCommand as AgentRunCommand2, OpenCodeConfigSchema as OpenCodeConfigSchema2 } from "@wix/evalforge-types";
|
|
2102
2123
|
|
|
2103
2124
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2104
2125
|
import { spawn } from "child_process";
|
|
@@ -2216,21 +2237,34 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = fetchGitHub
|
|
|
2216
2237
|
import { homedir as homedir2 } from "os";
|
|
2217
2238
|
import {
|
|
2218
2239
|
ClaudeModel as ClaudeModel2,
|
|
2219
|
-
AVAILABLE_OPENAI_MODEL_IDS
|
|
2240
|
+
AVAILABLE_OPENAI_MODEL_IDS,
|
|
2241
|
+
AVAILABLE_GEMINI_MODEL_IDS,
|
|
2242
|
+
OpenCodeConfigSchema
|
|
2220
2243
|
} from "@wix/evalforge-types";
|
|
2221
2244
|
var DEFAULT_MODEL2 = `${ClaudeModel2.CLAUDE_4_5_SONNET_1_0}`;
|
|
2245
|
+
var OPENCODE_MODEL_ALIASES = {
|
|
2246
|
+
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
2247
|
+
"claude-opus-4": "claude-opus-4-0"
|
|
2248
|
+
};
|
|
2222
2249
|
function parseModel(model) {
|
|
2223
2250
|
const slashIndex = model.indexOf("/");
|
|
2224
2251
|
if (slashIndex > 0) {
|
|
2252
|
+
const providerID = model.slice(0, slashIndex);
|
|
2253
|
+
const rawModelID = model.slice(slashIndex + 1);
|
|
2225
2254
|
return {
|
|
2226
|
-
providerID
|
|
2227
|
-
modelID:
|
|
2255
|
+
providerID,
|
|
2256
|
+
modelID: OPENCODE_MODEL_ALIASES[rawModelID] ?? rawModelID
|
|
2228
2257
|
};
|
|
2229
2258
|
}
|
|
2259
|
+
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
2230
2260
|
const isOpenAI = AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
2231
2261
|
model
|
|
2232
2262
|
);
|
|
2233
|
-
|
|
2263
|
+
const isGemini = AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
2264
|
+
model
|
|
2265
|
+
);
|
|
2266
|
+
if (isGemini) return { providerID: "google", modelID };
|
|
2267
|
+
return { providerID: isOpenAI ? "openai" : "anthropic", modelID };
|
|
2234
2268
|
}
|
|
2235
2269
|
function toOpenCodeMcpConfig(servers) {
|
|
2236
2270
|
const result = {};
|
|
@@ -2281,8 +2315,9 @@ async function buildOpenCodeEnv(options) {
|
|
|
2281
2315
|
const { providerID, modelID } = parseModel(modelStr);
|
|
2282
2316
|
const provider = {};
|
|
2283
2317
|
if (options.aiGatewayUrl) {
|
|
2318
|
+
const proxyPath = providerID === "google" ? "gemini" : providerID;
|
|
2284
2319
|
const providerOptions = {
|
|
2285
|
-
baseURL: `${options.aiGatewayUrl}/proxy/${
|
|
2320
|
+
baseURL: `${options.aiGatewayUrl}/proxy/${proxyPath}`,
|
|
2286
2321
|
apiKey: "sk-placeholder-auth-handled-by-gateway"
|
|
2287
2322
|
};
|
|
2288
2323
|
if (options.aiGatewayHeaders) {
|
|
@@ -2313,20 +2348,26 @@ async function buildOpenCodeEnv(options) {
|
|
|
2313
2348
|
if (options.temperature != null) {
|
|
2314
2349
|
agentOverrides.temperature = options.temperature;
|
|
2315
2350
|
}
|
|
2316
|
-
if (options.
|
|
2351
|
+
if (options.topP != null) {
|
|
2352
|
+
agentOverrides.top_p = options.topP;
|
|
2353
|
+
}
|
|
2354
|
+
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
2317
2355
|
agentOverrides.maxSteps = options.maxTurns;
|
|
2318
2356
|
}
|
|
2357
|
+
const parsed = options.config ? OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
2358
|
+
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
2359
|
+
const defaultPermission = {
|
|
2360
|
+
"*": "allow"
|
|
2361
|
+
};
|
|
2362
|
+
const permission = {
|
|
2363
|
+
...defaultPermission,
|
|
2364
|
+
...configPermission
|
|
2365
|
+
};
|
|
2319
2366
|
const config = {
|
|
2320
2367
|
model: `${providerID}/${modelID}`,
|
|
2321
2368
|
provider,
|
|
2322
2369
|
...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
|
|
2323
|
-
permission
|
|
2324
|
-
edit: "allow",
|
|
2325
|
-
bash: "allow",
|
|
2326
|
-
webfetch: "allow",
|
|
2327
|
-
doom_loop: "allow",
|
|
2328
|
-
external_directory: "allow"
|
|
2329
|
-
},
|
|
2370
|
+
permission,
|
|
2330
2371
|
...mcp ? { mcp } : {}
|
|
2331
2372
|
};
|
|
2332
2373
|
const env = {
|
|
@@ -3060,16 +3101,18 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3060
3101
|
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
3061
3102
|
model: options.model
|
|
3062
3103
|
});
|
|
3063
|
-
const maxTurns = options.maxTurns
|
|
3064
|
-
const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
|
|
3104
|
+
const maxTurns = options.maxTurns || void 0;
|
|
3105
|
+
const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
|
|
3065
3106
|
const { env, providerID, modelID } = await buildOpenCodeEnv({
|
|
3066
3107
|
model: options.model,
|
|
3067
3108
|
temperature: options.temperature,
|
|
3109
|
+
topP: options.topP,
|
|
3068
3110
|
maxTurns,
|
|
3069
3111
|
aiGatewayUrl: options.aiGatewayUrl,
|
|
3070
3112
|
aiGatewayHeaders: options.aiGatewayHeaders,
|
|
3071
3113
|
mcps: options.mcps,
|
|
3072
|
-
cwd: options.cwd
|
|
3114
|
+
cwd: options.cwd,
|
|
3115
|
+
config: options.config
|
|
3073
3116
|
});
|
|
3074
3117
|
const startTime = /* @__PURE__ */ new Date();
|
|
3075
3118
|
const traceContext = options.traceContext;
|
|
@@ -3097,13 +3140,13 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3097
3140
|
traceContext.authToken
|
|
3098
3141
|
);
|
|
3099
3142
|
}
|
|
3143
|
+
const variant = options.thinkingVariant ?? "high";
|
|
3144
|
+
const thinkingArgs = variant === "none" ? [] : ["--thinking", "--variant", variant];
|
|
3100
3145
|
const baseArgs = [
|
|
3101
3146
|
"run",
|
|
3102
3147
|
"--format",
|
|
3103
3148
|
"json",
|
|
3104
|
-
|
|
3105
|
-
"--variant",
|
|
3106
|
-
"high",
|
|
3149
|
+
...thinkingArgs,
|
|
3107
3150
|
"--model",
|
|
3108
3151
|
`${providerID}/${modelID}`,
|
|
3109
3152
|
"--dir",
|
|
@@ -3294,7 +3337,7 @@ var OpenCodeAdapter = class {
|
|
|
3294
3337
|
skills,
|
|
3295
3338
|
scenario,
|
|
3296
3339
|
cwd,
|
|
3297
|
-
|
|
3340
|
+
config,
|
|
3298
3341
|
aiGatewayUrl,
|
|
3299
3342
|
aiGatewayHeaders,
|
|
3300
3343
|
traceContext,
|
|
@@ -3303,18 +3346,26 @@ var OpenCodeAdapter = class {
|
|
|
3303
3346
|
rules,
|
|
3304
3347
|
systemPrompt
|
|
3305
3348
|
} = context;
|
|
3349
|
+
const typed = config ? OpenCodeConfigSchema2.passthrough().safeParse(config) : void 0;
|
|
3350
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
3351
|
+
const rawMaxTurns = cfg?.maxTurns;
|
|
3352
|
+
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
3306
3353
|
const options = {
|
|
3307
3354
|
cwd,
|
|
3308
|
-
model:
|
|
3309
|
-
temperature:
|
|
3310
|
-
|
|
3355
|
+
model: cfg?.model,
|
|
3356
|
+
temperature: cfg?.temperature,
|
|
3357
|
+
topP: cfg?.topP,
|
|
3358
|
+
maxTurns,
|
|
3359
|
+
thinkingVariant: cfg?.thinkingVariant,
|
|
3360
|
+
maxDurationMs: cfg?.maxDurationMs,
|
|
3311
3361
|
aiGatewayUrl,
|
|
3312
3362
|
aiGatewayHeaders,
|
|
3313
3363
|
traceContext,
|
|
3314
3364
|
mcps,
|
|
3315
3365
|
subAgents,
|
|
3316
3366
|
rules,
|
|
3317
|
-
systemPrompt
|
|
3367
|
+
systemPrompt,
|
|
3368
|
+
config
|
|
3318
3369
|
};
|
|
3319
3370
|
const { result, llmTrace, conversation } = await executeWithOpenCode(
|
|
3320
3371
|
skills,
|
|
@@ -3346,12 +3397,16 @@ import {
|
|
|
3346
3397
|
stepCountIs
|
|
3347
3398
|
} from "ai";
|
|
3348
3399
|
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
3400
|
+
import { createGoogleGenerativeAI } from "@ai-sdk/google";
|
|
3349
3401
|
import { createOpenAI } from "@ai-sdk/openai";
|
|
3350
3402
|
import {
|
|
3351
3403
|
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
3404
|
+
AVAILABLE_GEMINI_MODEL_IDS as AVAILABLE_GEMINI_MODEL_IDS2,
|
|
3405
|
+
GEMINI_THINKING_MODEL_IDS,
|
|
3352
3406
|
OPENAI_RESPONSES_MODEL_IDS,
|
|
3353
3407
|
LLMStepType as LLMStepType3,
|
|
3354
|
-
LiveTraceEventType as LiveTraceEventType3
|
|
3408
|
+
LiveTraceEventType as LiveTraceEventType3,
|
|
3409
|
+
SimpleAgentConfigSchema
|
|
3355
3410
|
} from "@wix/evalforge-types";
|
|
3356
3411
|
import { randomUUID as randomUUID3 } from "crypto";
|
|
3357
3412
|
|
|
@@ -3373,8 +3428,9 @@ async function buildMcpTools(mcps, cwd) {
|
|
|
3373
3428
|
const client = await createMCPClient({ transport });
|
|
3374
3429
|
clients.push(client);
|
|
3375
3430
|
const tools = await client.tools();
|
|
3431
|
+
const safePrefix = serverName.replace(/[^a-zA-Z0-9]/g, "_");
|
|
3376
3432
|
for (const [toolName, tool] of Object.entries(tools)) {
|
|
3377
|
-
allTools[`${
|
|
3433
|
+
allTools[`${safePrefix}_${toolName}`] = tool;
|
|
3378
3434
|
}
|
|
3379
3435
|
}
|
|
3380
3436
|
}
|
|
@@ -3450,6 +3506,7 @@ function extractErrorText(content) {
|
|
|
3450
3506
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
3451
3507
|
import { normalizeModelId } from "@wix/evalforge-types";
|
|
3452
3508
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
3509
|
+
var PROVIDER_GEMINI = "gemini";
|
|
3453
3510
|
var MODEL_PRICING = {
|
|
3454
3511
|
// Anthropic — Claude 4.6
|
|
3455
3512
|
"claude-sonnet-4-6": { input: 3, output: 15 },
|
|
@@ -3476,7 +3533,18 @@ var MODEL_PRICING = {
|
|
|
3476
3533
|
o3: { input: 2, output: 8 },
|
|
3477
3534
|
"o4-mini": { input: 1.1, output: 4.4 },
|
|
3478
3535
|
"o3-mini": { input: 1.1, output: 4.4 },
|
|
3479
|
-
o1: { input: 15, output: 60 }
|
|
3536
|
+
o1: { input: 15, output: 60 },
|
|
3537
|
+
// Google Gemini 2.0
|
|
3538
|
+
"gemini-2.0-flash": { input: 0.1, output: 0.4 },
|
|
3539
|
+
"gemini-2.0-flash-lite": { input: 0.075, output: 0.3 },
|
|
3540
|
+
// Google Gemini 2.5
|
|
3541
|
+
"gemini-2.5-pro": { input: 1.25, output: 10 },
|
|
3542
|
+
"gemini-2.5-flash": { input: 0.15, output: 0.6 },
|
|
3543
|
+
"gemini-2.5-flash-lite": { input: 0.075, output: 0.3 },
|
|
3544
|
+
// Google Gemini 3.x — standard pricing up to 200K context tokens
|
|
3545
|
+
"gemini-3-pro-preview": { input: 2, output: 12 },
|
|
3546
|
+
"gemini-3-flash-preview": { input: 0.5, output: 3 },
|
|
3547
|
+
"gemini-3.1-pro-preview": { input: 2, output: 12 }
|
|
3480
3548
|
};
|
|
3481
3549
|
function extractGatewayCost(step, provider) {
|
|
3482
3550
|
try {
|
|
@@ -3487,6 +3555,15 @@ function extractGatewayCost(step, provider) {
|
|
|
3487
3555
|
const cost2 = usage?.total_cost_usd;
|
|
3488
3556
|
return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
|
|
3489
3557
|
}
|
|
3558
|
+
if (provider === PROVIDER_GEMINI) {
|
|
3559
|
+
const meta = step.providerMetadata;
|
|
3560
|
+
const google = meta?.google;
|
|
3561
|
+
const cost2 = google?.total_cost_usd;
|
|
3562
|
+
if (typeof cost2 === "number" && cost2 > 0) return cost2;
|
|
3563
|
+
const body2 = step.response?.body;
|
|
3564
|
+
const bodyCost = body2?.total_cost_usd;
|
|
3565
|
+
return typeof bodyCost === "number" && bodyCost > 0 ? bodyCost : void 0;
|
|
3566
|
+
}
|
|
3490
3567
|
const body = step.response?.body;
|
|
3491
3568
|
const cost = body?.total_cost_usd;
|
|
3492
3569
|
return typeof cost === "number" && cost > 0 ? cost : void 0;
|
|
@@ -3564,10 +3641,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3564
3641
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3565
3642
|
var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
3566
3643
|
var PROVIDER_OPENAI = "openai";
|
|
3644
|
+
var PROVIDER_GEMINI2 = "gemini";
|
|
3567
3645
|
var DEFAULT_MAX_TOOL_STEPS = 25;
|
|
3568
3646
|
function createModel(modelId, baseUrl, headers) {
|
|
3569
|
-
|
|
3570
|
-
if (isClaudeModel) {
|
|
3647
|
+
if (isClaudeModelId(modelId)) {
|
|
3571
3648
|
const anthropic = createAnthropic({
|
|
3572
3649
|
baseURL: `${baseUrl}/proxy/anthropic`,
|
|
3573
3650
|
apiKey: "proxy-auth",
|
|
@@ -3575,6 +3652,14 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
3575
3652
|
});
|
|
3576
3653
|
return anthropic(modelId);
|
|
3577
3654
|
}
|
|
3655
|
+
if (isGeminiModelId(modelId)) {
|
|
3656
|
+
const google = createGoogleGenerativeAI({
|
|
3657
|
+
baseURL: `${baseUrl}/proxy/gemini`,
|
|
3658
|
+
apiKey: "proxy-auth",
|
|
3659
|
+
headers
|
|
3660
|
+
});
|
|
3661
|
+
return google(modelId);
|
|
3662
|
+
}
|
|
3578
3663
|
const openai = createOpenAI({
|
|
3579
3664
|
baseURL: `${baseUrl}/proxy/openai`,
|
|
3580
3665
|
apiKey: "proxy-auth",
|
|
@@ -3592,6 +3677,11 @@ function isClaudeModelId(modelId) {
|
|
|
3592
3677
|
(id) => modelId === id || modelId.startsWith(id)
|
|
3593
3678
|
);
|
|
3594
3679
|
}
|
|
3680
|
+
function isGeminiModelId(modelId) {
|
|
3681
|
+
return AVAILABLE_GEMINI_MODEL_IDS2.some(
|
|
3682
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3683
|
+
);
|
|
3684
|
+
}
|
|
3595
3685
|
function extractSkillContent(files) {
|
|
3596
3686
|
if (!files || files.length === 0) return void 0;
|
|
3597
3687
|
const skillMd = files.find((f) => f.path === "SKILL.md");
|
|
@@ -3602,20 +3692,30 @@ async function executeWithAiSdk(context) {
|
|
|
3602
3692
|
const {
|
|
3603
3693
|
scenario,
|
|
3604
3694
|
cwd,
|
|
3605
|
-
|
|
3695
|
+
config,
|
|
3606
3696
|
aiGatewayUrl,
|
|
3607
3697
|
aiGatewayHeaders = {},
|
|
3608
3698
|
mcps,
|
|
3609
3699
|
traceContext
|
|
3610
3700
|
} = context;
|
|
3701
|
+
const typed = config ? SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
3702
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
3703
|
+
const schemaKeys = new Set(Object.keys(SimpleAgentConfigSchema.shape));
|
|
3704
|
+
const configExtras = {};
|
|
3705
|
+
if (config) {
|
|
3706
|
+
for (const [key, value] of Object.entries(config)) {
|
|
3707
|
+
if (!schemaKeys.has(key)) configExtras[key] = value;
|
|
3708
|
+
}
|
|
3709
|
+
}
|
|
3611
3710
|
if (!aiGatewayUrl) {
|
|
3612
3711
|
throw new Error("Simple Agent requires aiGatewayUrl");
|
|
3613
3712
|
}
|
|
3614
|
-
if (!
|
|
3615
|
-
throw new Error("Simple Agent requires a model in
|
|
3713
|
+
if (!cfg?.model) {
|
|
3714
|
+
throw new Error("Simple Agent requires a model in config");
|
|
3616
3715
|
}
|
|
3617
|
-
const
|
|
3618
|
-
const
|
|
3716
|
+
const modelId = cfg.model;
|
|
3717
|
+
const model = createModel(modelId, aiGatewayUrl, aiGatewayHeaders);
|
|
3718
|
+
const provider = isClaudeModelId(modelId) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelId) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
|
|
3619
3719
|
const systemPrompt = composeSystemPrompt(context);
|
|
3620
3720
|
const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
|
|
3621
3721
|
const startTime = Date.now();
|
|
@@ -3625,13 +3725,17 @@ async function executeWithAiSdk(context) {
|
|
|
3625
3725
|
try {
|
|
3626
3726
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
3627
3727
|
const isResponsesAPI = [...OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3628
|
-
(id) =>
|
|
3728
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3629
3729
|
);
|
|
3630
|
-
const
|
|
3631
|
-
const
|
|
3730
|
+
const isGemini = provider === PROVIDER_GEMINI2;
|
|
3731
|
+
const isGeminiThinking = isGemini && GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
3732
|
+
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
3733
|
+
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
3734
|
+
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
3735
|
+
const computedProviderOpts = {
|
|
3632
3736
|
...isAnthropic && {
|
|
3633
3737
|
anthropic: {
|
|
3634
|
-
thinking: { type: "enabled", budgetTokens:
|
|
3738
|
+
thinking: { type: "enabled", budgetTokens: thinkingBudgetTokens }
|
|
3635
3739
|
}
|
|
3636
3740
|
},
|
|
3637
3741
|
...isResponsesAPI && {
|
|
@@ -3639,15 +3743,30 @@ async function executeWithAiSdk(context) {
|
|
|
3639
3743
|
// Prevent the SDK from sending item_reference inputs that the proxy can't forward
|
|
3640
3744
|
store: false,
|
|
3641
3745
|
forceReasoning: true,
|
|
3642
|
-
reasoningEffort
|
|
3746
|
+
reasoningEffort,
|
|
3643
3747
|
reasoningSummary: "detailed"
|
|
3644
3748
|
}
|
|
3749
|
+
},
|
|
3750
|
+
...isGeminiThinking && {
|
|
3751
|
+
google: {
|
|
3752
|
+
thinkingConfig: {
|
|
3753
|
+
includeThoughts: true,
|
|
3754
|
+
thinkingBudget: 1e4
|
|
3755
|
+
}
|
|
3756
|
+
}
|
|
3645
3757
|
}
|
|
3646
3758
|
};
|
|
3759
|
+
const { providerOptions: extraProviderOptions, ...topLevelExtras } = configExtras;
|
|
3760
|
+
const mergedProviderOptions = {
|
|
3761
|
+
...extraProviderOptions && typeof extraProviderOptions === "object" ? extraProviderOptions : {},
|
|
3762
|
+
...computedProviderOpts
|
|
3763
|
+
};
|
|
3647
3764
|
const stepTimestamps = [];
|
|
3765
|
+
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3648
3766
|
const { triggerPromptImages } = context;
|
|
3649
3767
|
const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
|
|
3650
|
-
const
|
|
3768
|
+
const result = await generateText({
|
|
3769
|
+
...topLevelExtras,
|
|
3651
3770
|
model,
|
|
3652
3771
|
system: systemPrompt,
|
|
3653
3772
|
...hasImages ? {
|
|
@@ -3665,11 +3784,16 @@ async function executeWithAiSdk(context) {
|
|
|
3665
3784
|
}
|
|
3666
3785
|
]
|
|
3667
3786
|
} : { prompt: scenario.triggerPrompt },
|
|
3668
|
-
temperature: supportsThinking ? void 0 :
|
|
3669
|
-
|
|
3787
|
+
temperature: supportsThinking ? void 0 : cfg.temperature,
|
|
3788
|
+
topP: supportsThinking ? void 0 : cfg.topP,
|
|
3789
|
+
frequencyPenalty: cfg.frequencyPenalty,
|
|
3790
|
+
presencePenalty: cfg.presencePenalty,
|
|
3791
|
+
seed: cfg.seed,
|
|
3792
|
+
stopSequences: cfg.stopSequences,
|
|
3793
|
+
maxOutputTokens: cfg.maxTokens,
|
|
3670
3794
|
tools: mcpTools,
|
|
3671
|
-
|
|
3672
|
-
providerOptions:
|
|
3795
|
+
...mcpTools && effectiveMaxTurns != null ? { stopWhen: stepCountIs(effectiveMaxTurns) } : !mcpTools ? { stopWhen: stepCountIs(1) } : {},
|
|
3796
|
+
providerOptions: mergedProviderOptions,
|
|
3673
3797
|
onStepFinish: (step) => {
|
|
3674
3798
|
stepTimestamps.push(Date.now());
|
|
3675
3799
|
if (traceContext) {
|
|
@@ -3697,8 +3821,7 @@ async function executeWithAiSdk(context) {
|
|
|
3697
3821
|
);
|
|
3698
3822
|
}
|
|
3699
3823
|
}
|
|
3700
|
-
};
|
|
3701
|
-
const result = await generateText(generateTextParams);
|
|
3824
|
+
});
|
|
3702
3825
|
const durationMs = Date.now() - startTime;
|
|
3703
3826
|
const usage = {
|
|
3704
3827
|
inputTokens: result.usage.inputTokens ?? 0,
|
|
@@ -3709,7 +3832,7 @@ async function executeWithAiSdk(context) {
|
|
|
3709
3832
|
result.steps,
|
|
3710
3833
|
durationMs,
|
|
3711
3834
|
usage,
|
|
3712
|
-
|
|
3835
|
+
modelId,
|
|
3713
3836
|
provider,
|
|
3714
3837
|
startTime,
|
|
3715
3838
|
stepTimestamps
|
|
@@ -4622,11 +4745,12 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4622
4745
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4623
4746
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4624
4747
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4748
|
+
const agentConfig = agent?.config;
|
|
4625
4749
|
const executionContext = {
|
|
4626
4750
|
skills: evalData.skills,
|
|
4627
4751
|
scenario,
|
|
4628
4752
|
cwd: workDir || process.cwd(),
|
|
4629
|
-
|
|
4753
|
+
config: agentConfig,
|
|
4630
4754
|
aiGatewayUrl: config.aiGatewayUrl,
|
|
4631
4755
|
aiGatewayHeaders: config.aiGatewayHeaders,
|
|
4632
4756
|
traceContext: {
|
|
@@ -4661,7 +4785,18 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4661
4785
|
infrastructurePaths
|
|
4662
4786
|
);
|
|
4663
4787
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4664
|
-
const
|
|
4788
|
+
const snapshotModelConfig = agentConfig?.model ? {
|
|
4789
|
+
model: agentConfig.model,
|
|
4790
|
+
...agentConfig.temperature != null && {
|
|
4791
|
+
temperature: agentConfig.temperature
|
|
4792
|
+
},
|
|
4793
|
+
...agentConfig.maxTokens != null && {
|
|
4794
|
+
maxTokens: agentConfig.maxTokens
|
|
4795
|
+
},
|
|
4796
|
+
...agentConfig.maxTurns != null && {
|
|
4797
|
+
maxTurns: agentConfig.maxTurns
|
|
4798
|
+
}
|
|
4799
|
+
} : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
|
|
4665
4800
|
return {
|
|
4666
4801
|
id: randomUUID4(),
|
|
4667
4802
|
targetId,
|
|
@@ -4669,7 +4804,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4669
4804
|
scenarioId: scenario.id,
|
|
4670
4805
|
scenarioName: scenario.name,
|
|
4671
4806
|
triggerPrompt: scenario.triggerPrompt,
|
|
4672
|
-
modelConfig:
|
|
4807
|
+
modelConfig: snapshotModelConfig,
|
|
4673
4808
|
duration: durationMs,
|
|
4674
4809
|
outputText,
|
|
4675
4810
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|