@wix/evalforge-evaluator 0.146.0 → 0.148.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -315,10 +315,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
315
315
  if (evalRun.agentId) {
316
316
  agent = await api.getAgent(projectId2, evalRun.agentId);
317
317
  }
318
- let skills = [];
319
- let mcps = [];
320
- let subAgents = [];
321
- let rules = [];
318
+ const skills = [];
319
+ const mcps = [];
320
+ const subAgents = [];
321
+ const rules = [];
322
322
  if (evalRun.capabilityIds && evalRun.capabilityIds.length > 0) {
323
323
  const fetchResults = await Promise.allSettled(
324
324
  evalRun.capabilityIds.map((id) => api.getCapability(projectId2, id))
@@ -605,7 +605,7 @@ function getAdapter(identifier) {
605
605
  }
606
606
 
607
607
  // src/run-scenario/agents/claude-code/claude-code-adapter.ts
608
- import { AgentRunCommand } from "@wix/evalforge-types";
608
+ import { AgentRunCommand, ClaudeCodeConfigSchema } from "@wix/evalforge-types";
609
609
 
610
610
  // src/run-scenario/agents/claude-code/execute.ts
611
611
  import {
@@ -1187,10 +1187,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
1187
1187
  let lastAction = "Starting...";
1188
1188
  let lastToolName;
1189
1189
  let lastFilePath;
1190
- const maxTurns = options.maxTurns ?? 10;
1190
+ const maxTurns = options.maxTurns || void 0;
1191
1191
  let messageCount = 0;
1192
1192
  const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
1193
- const baseAllowedTools = [
1193
+ const baseAllowedTools = options.allowedTools ?? [
1194
1194
  "Skill",
1195
1195
  "Read",
1196
1196
  "Write",
@@ -1204,13 +1204,17 @@ async function executeWithClaudeCode(skills, scenario, options) {
1204
1204
  ];
1205
1205
  const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
1206
1206
  const queryOptions = {
1207
+ ...options.extras ?? {},
1207
1208
  env: sdkEnv,
1208
1209
  cwd: options.cwd,
1209
1210
  settingSources: ["project"],
1210
1211
  allowedTools,
1212
+ ...options.disallowedTools?.length ? { disallowedTools: options.disallowedTools } : {},
1211
1213
  model: options.model || DEFAULT_MODEL,
1212
1214
  maxTurns,
1213
1215
  maxThinkingTokens: options.maxThinkingTokens,
1216
+ ...options.effort ? { effort: options.effort } : {},
1217
+ ...options.maxBudgetUsd != null ? { maxBudgetUsd: options.maxBudgetUsd } : {},
1214
1218
  // Use 'default' permission mode with custom canUseTool handler
1215
1219
  // instead of 'bypassPermissions' which fails on root
1216
1220
  permissionMode: "default",
@@ -1285,7 +1289,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
1285
1289
  traceContext.authToken
1286
1290
  );
1287
1291
  }
1288
- const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
1292
+ const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
1289
1293
  let timeoutHandle;
1290
1294
  let timedOut = false;
1291
1295
  const HEARTBEAT_INTERVAL_MS = 1e4;
@@ -2049,7 +2053,7 @@ var ClaudeCodeAdapter = class {
2049
2053
  skills,
2050
2054
  scenario,
2051
2055
  cwd,
2052
- modelConfig,
2056
+ config,
2053
2057
  aiGatewayUrl,
2054
2058
  aiGatewayHeaders,
2055
2059
  traceContext,
@@ -2058,20 +2062,37 @@ var ClaudeCodeAdapter = class {
2058
2062
  rules,
2059
2063
  systemPrompt
2060
2064
  } = context;
2061
- const modelForSdk = modelConfig?.model;
2065
+ const typed = config ? ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
2066
+ const cfg = typed?.success ? typed.data : void 0;
2067
+ const schemaKeys = new Set(Object.keys(ClaudeCodeConfigSchema.shape));
2068
+ const extras = {};
2069
+ if (config) {
2070
+ for (const [key, value] of Object.entries(config)) {
2071
+ if (!schemaKeys.has(key)) extras[key] = value;
2072
+ }
2073
+ }
2074
+ const rawMaxTurns = cfg?.maxTurns;
2075
+ const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
2062
2076
  const options = {
2063
2077
  cwd,
2064
- model: modelForSdk,
2065
- temperature: modelConfig?.temperature,
2066
- maxTokens: modelConfig?.maxTokens,
2067
- maxTurns: modelConfig?.maxTurns,
2078
+ model: cfg?.model,
2079
+ temperature: cfg?.temperature,
2080
+ maxTokens: cfg?.maxTokens,
2081
+ maxTurns,
2082
+ maxThinkingTokens: cfg?.maxThinkingTokens,
2083
+ allowedTools: cfg?.allowedTools,
2084
+ disallowedTools: cfg?.disallowedTools,
2085
+ effort: cfg?.effort,
2086
+ maxBudgetUsd: cfg?.maxBudgetUsd,
2087
+ maxDurationMs: cfg?.maxDurationMs,
2068
2088
  aiGatewayUrl,
2069
2089
  aiGatewayHeaders,
2070
2090
  traceContext,
2071
2091
  mcps,
2072
2092
  subAgents,
2073
2093
  rules,
2074
- systemPrompt
2094
+ systemPrompt,
2095
+ extras
2075
2096
  };
2076
2097
  const { result, llmTrace, conversation } = await executeWithClaudeCode(
2077
2098
  skills,
@@ -2098,7 +2119,7 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
2098
2119
  defaultRegistry.register(claudeCodeAdapter);
2099
2120
 
2100
2121
  // src/run-scenario/agents/opencode/opencode-adapter.ts
2101
- import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
2122
+ import { AgentRunCommand as AgentRunCommand2, OpenCodeConfigSchema as OpenCodeConfigSchema2 } from "@wix/evalforge-types";
2102
2123
 
2103
2124
  // src/run-scenario/agents/opencode/execute.ts
2104
2125
  import { spawn } from "child_process";
@@ -2216,21 +2237,34 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = fetchGitHub
2216
2237
  import { homedir as homedir2 } from "os";
2217
2238
  import {
2218
2239
  ClaudeModel as ClaudeModel2,
2219
- AVAILABLE_OPENAI_MODEL_IDS
2240
+ AVAILABLE_OPENAI_MODEL_IDS,
2241
+ AVAILABLE_GEMINI_MODEL_IDS,
2242
+ OpenCodeConfigSchema
2220
2243
  } from "@wix/evalforge-types";
2221
2244
  var DEFAULT_MODEL2 = `${ClaudeModel2.CLAUDE_4_5_SONNET_1_0}`;
2245
+ var OPENCODE_MODEL_ALIASES = {
2246
+ "claude-sonnet-4": "claude-sonnet-4-0",
2247
+ "claude-opus-4": "claude-opus-4-0"
2248
+ };
2222
2249
  function parseModel(model) {
2223
2250
  const slashIndex = model.indexOf("/");
2224
2251
  if (slashIndex > 0) {
2252
+ const providerID = model.slice(0, slashIndex);
2253
+ const rawModelID = model.slice(slashIndex + 1);
2225
2254
  return {
2226
- providerID: model.slice(0, slashIndex),
2227
- modelID: model.slice(slashIndex + 1)
2255
+ providerID,
2256
+ modelID: OPENCODE_MODEL_ALIASES[rawModelID] ?? rawModelID
2228
2257
  };
2229
2258
  }
2259
+ const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
2230
2260
  const isOpenAI = AVAILABLE_OPENAI_MODEL_IDS.includes(
2231
2261
  model
2232
2262
  );
2233
- return { providerID: isOpenAI ? "openai" : "anthropic", modelID: model };
2263
+ const isGemini = AVAILABLE_GEMINI_MODEL_IDS.includes(
2264
+ model
2265
+ );
2266
+ if (isGemini) return { providerID: "google", modelID };
2267
+ return { providerID: isOpenAI ? "openai" : "anthropic", modelID };
2234
2268
  }
2235
2269
  function toOpenCodeMcpConfig(servers) {
2236
2270
  const result = {};
@@ -2281,8 +2315,9 @@ async function buildOpenCodeEnv(options) {
2281
2315
  const { providerID, modelID } = parseModel(modelStr);
2282
2316
  const provider = {};
2283
2317
  if (options.aiGatewayUrl) {
2318
+ const proxyPath = providerID === "google" ? "gemini" : providerID;
2284
2319
  const providerOptions = {
2285
- baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
2320
+ baseURL: `${options.aiGatewayUrl}/proxy/${proxyPath}`,
2286
2321
  apiKey: "sk-placeholder-auth-handled-by-gateway"
2287
2322
  };
2288
2323
  if (options.aiGatewayHeaders) {
@@ -2313,20 +2348,26 @@ async function buildOpenCodeEnv(options) {
2313
2348
  if (options.temperature != null) {
2314
2349
  agentOverrides.temperature = options.temperature;
2315
2350
  }
2316
- if (options.maxTurns != null) {
2351
+ if (options.topP != null) {
2352
+ agentOverrides.top_p = options.topP;
2353
+ }
2354
+ if (options.maxTurns != null && options.maxTurns > 0) {
2317
2355
  agentOverrides.maxSteps = options.maxTurns;
2318
2356
  }
2357
+ const parsed = options.config ? OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
2358
+ const configPermission = parsed?.success ? parsed.data.permission : void 0;
2359
+ const defaultPermission = {
2360
+ "*": "allow"
2361
+ };
2362
+ const permission = {
2363
+ ...defaultPermission,
2364
+ ...configPermission
2365
+ };
2319
2366
  const config = {
2320
2367
  model: `${providerID}/${modelID}`,
2321
2368
  provider,
2322
2369
  ...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
2323
- permission: {
2324
- edit: "allow",
2325
- bash: "allow",
2326
- webfetch: "allow",
2327
- doom_loop: "allow",
2328
- external_directory: "allow"
2329
- },
2370
+ permission,
2330
2371
  ...mcp ? { mcp } : {}
2331
2372
  };
2332
2373
  const env = {
@@ -3060,16 +3101,18 @@ async function executeWithOpenCode(skills, scenario, options) {
3060
3101
  hasAiGatewayHeaders: !!options.aiGatewayHeaders,
3061
3102
  model: options.model
3062
3103
  });
3063
- const maxTurns = options.maxTurns ?? 10;
3064
- const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
3104
+ const maxTurns = options.maxTurns || void 0;
3105
+ const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
3065
3106
  const { env, providerID, modelID } = await buildOpenCodeEnv({
3066
3107
  model: options.model,
3067
3108
  temperature: options.temperature,
3109
+ topP: options.topP,
3068
3110
  maxTurns,
3069
3111
  aiGatewayUrl: options.aiGatewayUrl,
3070
3112
  aiGatewayHeaders: options.aiGatewayHeaders,
3071
3113
  mcps: options.mcps,
3072
- cwd: options.cwd
3114
+ cwd: options.cwd,
3115
+ config: options.config
3073
3116
  });
3074
3117
  const startTime = /* @__PURE__ */ new Date();
3075
3118
  const traceContext = options.traceContext;
@@ -3097,13 +3140,13 @@ async function executeWithOpenCode(skills, scenario, options) {
3097
3140
  traceContext.authToken
3098
3141
  );
3099
3142
  }
3143
+ const variant = options.thinkingVariant ?? "high";
3144
+ const thinkingArgs = variant === "none" ? [] : ["--thinking", "--variant", variant];
3100
3145
  const baseArgs = [
3101
3146
  "run",
3102
3147
  "--format",
3103
3148
  "json",
3104
- "--thinking",
3105
- "--variant",
3106
- "high",
3149
+ ...thinkingArgs,
3107
3150
  "--model",
3108
3151
  `${providerID}/${modelID}`,
3109
3152
  "--dir",
@@ -3294,7 +3337,7 @@ var OpenCodeAdapter = class {
3294
3337
  skills,
3295
3338
  scenario,
3296
3339
  cwd,
3297
- modelConfig,
3340
+ config,
3298
3341
  aiGatewayUrl,
3299
3342
  aiGatewayHeaders,
3300
3343
  traceContext,
@@ -3303,18 +3346,26 @@ var OpenCodeAdapter = class {
3303
3346
  rules,
3304
3347
  systemPrompt
3305
3348
  } = context;
3349
+ const typed = config ? OpenCodeConfigSchema2.passthrough().safeParse(config) : void 0;
3350
+ const cfg = typed?.success ? typed.data : void 0;
3351
+ const rawMaxTurns = cfg?.maxTurns;
3352
+ const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
3306
3353
  const options = {
3307
3354
  cwd,
3308
- model: modelConfig?.model,
3309
- temperature: modelConfig?.temperature,
3310
- maxTurns: modelConfig?.maxTurns,
3355
+ model: cfg?.model,
3356
+ temperature: cfg?.temperature,
3357
+ topP: cfg?.topP,
3358
+ maxTurns,
3359
+ thinkingVariant: cfg?.thinkingVariant,
3360
+ maxDurationMs: cfg?.maxDurationMs,
3311
3361
  aiGatewayUrl,
3312
3362
  aiGatewayHeaders,
3313
3363
  traceContext,
3314
3364
  mcps,
3315
3365
  subAgents,
3316
3366
  rules,
3317
- systemPrompt
3367
+ systemPrompt,
3368
+ config
3318
3369
  };
3319
3370
  const { result, llmTrace, conversation } = await executeWithOpenCode(
3320
3371
  skills,
@@ -3346,12 +3397,16 @@ import {
3346
3397
  stepCountIs
3347
3398
  } from "ai";
3348
3399
  import { createAnthropic } from "@ai-sdk/anthropic";
3400
+ import { createGoogleGenerativeAI } from "@ai-sdk/google";
3349
3401
  import { createOpenAI } from "@ai-sdk/openai";
3350
3402
  import {
3351
3403
  AVAILABLE_CLAUDE_MODEL_IDS,
3404
+ AVAILABLE_GEMINI_MODEL_IDS as AVAILABLE_GEMINI_MODEL_IDS2,
3405
+ GEMINI_THINKING_MODEL_IDS,
3352
3406
  OPENAI_RESPONSES_MODEL_IDS,
3353
3407
  LLMStepType as LLMStepType3,
3354
- LiveTraceEventType as LiveTraceEventType3
3408
+ LiveTraceEventType as LiveTraceEventType3,
3409
+ SimpleAgentConfigSchema
3355
3410
  } from "@wix/evalforge-types";
3356
3411
  import { randomUUID as randomUUID3 } from "crypto";
3357
3412
 
@@ -3373,8 +3428,9 @@ async function buildMcpTools(mcps, cwd) {
3373
3428
  const client = await createMCPClient({ transport });
3374
3429
  clients.push(client);
3375
3430
  const tools = await client.tools();
3431
+ const safePrefix = serverName.replace(/[^a-zA-Z0-9]/g, "_");
3376
3432
  for (const [toolName, tool] of Object.entries(tools)) {
3377
- allTools[`${serverName}__${toolName}`] = tool;
3433
+ allTools[`${safePrefix}_${toolName}`] = tool;
3378
3434
  }
3379
3435
  }
3380
3436
  }
@@ -3450,6 +3506,7 @@ function extractErrorText(content) {
3450
3506
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
3451
3507
  import { normalizeModelId } from "@wix/evalforge-types";
3452
3508
  var PROVIDER_ANTHROPIC = "anthropic";
3509
+ var PROVIDER_GEMINI = "gemini";
3453
3510
  var MODEL_PRICING = {
3454
3511
  // Anthropic — Claude 4.6
3455
3512
  "claude-sonnet-4-6": { input: 3, output: 15 },
@@ -3476,7 +3533,18 @@ var MODEL_PRICING = {
3476
3533
  o3: { input: 2, output: 8 },
3477
3534
  "o4-mini": { input: 1.1, output: 4.4 },
3478
3535
  "o3-mini": { input: 1.1, output: 4.4 },
3479
- o1: { input: 15, output: 60 }
3536
+ o1: { input: 15, output: 60 },
3537
+ // Google Gemini 2.0
3538
+ "gemini-2.0-flash": { input: 0.1, output: 0.4 },
3539
+ "gemini-2.0-flash-lite": { input: 0.075, output: 0.3 },
3540
+ // Google Gemini 2.5
3541
+ "gemini-2.5-pro": { input: 1.25, output: 10 },
3542
+ "gemini-2.5-flash": { input: 0.15, output: 0.6 },
3543
+ "gemini-2.5-flash-lite": { input: 0.075, output: 0.3 },
3544
+ // Google Gemini 3.x — standard pricing up to 200K context tokens
3545
+ "gemini-3-pro-preview": { input: 2, output: 12 },
3546
+ "gemini-3-flash-preview": { input: 0.5, output: 3 },
3547
+ "gemini-3.1-pro-preview": { input: 2, output: 12 }
3480
3548
  };
3481
3549
  function extractGatewayCost(step, provider) {
3482
3550
  try {
@@ -3487,6 +3555,15 @@ function extractGatewayCost(step, provider) {
3487
3555
  const cost2 = usage?.total_cost_usd;
3488
3556
  return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
3489
3557
  }
3558
+ if (provider === PROVIDER_GEMINI) {
3559
+ const meta = step.providerMetadata;
3560
+ const google = meta?.google;
3561
+ const cost2 = google?.total_cost_usd;
3562
+ if (typeof cost2 === "number" && cost2 > 0) return cost2;
3563
+ const body2 = step.response?.body;
3564
+ const bodyCost = body2?.total_cost_usd;
3565
+ return typeof bodyCost === "number" && bodyCost > 0 ? bodyCost : void 0;
3566
+ }
3490
3567
  const body = step.response?.body;
3491
3568
  const cost = body?.total_cost_usd;
3492
3569
  return typeof cost === "number" && cost > 0 ? cost : void 0;
@@ -3564,10 +3641,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
3564
3641
  // src/run-scenario/agents/simple-agent/execute.ts
3565
3642
  var PROVIDER_ANTHROPIC2 = "anthropic";
3566
3643
  var PROVIDER_OPENAI = "openai";
3644
+ var PROVIDER_GEMINI2 = "gemini";
3567
3645
  var DEFAULT_MAX_TOOL_STEPS = 25;
3568
3646
  function createModel(modelId, baseUrl, headers) {
3569
- const isClaudeModel = isClaudeModelId(modelId);
3570
- if (isClaudeModel) {
3647
+ if (isClaudeModelId(modelId)) {
3571
3648
  const anthropic = createAnthropic({
3572
3649
  baseURL: `${baseUrl}/proxy/anthropic`,
3573
3650
  apiKey: "proxy-auth",
@@ -3575,6 +3652,14 @@ function createModel(modelId, baseUrl, headers) {
3575
3652
  });
3576
3653
  return anthropic(modelId);
3577
3654
  }
3655
+ if (isGeminiModelId(modelId)) {
3656
+ const google = createGoogleGenerativeAI({
3657
+ baseURL: `${baseUrl}/proxy/gemini`,
3658
+ apiKey: "proxy-auth",
3659
+ headers
3660
+ });
3661
+ return google(modelId);
3662
+ }
3578
3663
  const openai = createOpenAI({
3579
3664
  baseURL: `${baseUrl}/proxy/openai`,
3580
3665
  apiKey: "proxy-auth",
@@ -3592,6 +3677,11 @@ function isClaudeModelId(modelId) {
3592
3677
  (id) => modelId === id || modelId.startsWith(id)
3593
3678
  );
3594
3679
  }
3680
+ function isGeminiModelId(modelId) {
3681
+ return AVAILABLE_GEMINI_MODEL_IDS2.some(
3682
+ (id) => modelId === id || modelId.startsWith(id)
3683
+ );
3684
+ }
3595
3685
  function extractSkillContent(files) {
3596
3686
  if (!files || files.length === 0) return void 0;
3597
3687
  const skillMd = files.find((f) => f.path === "SKILL.md");
@@ -3602,20 +3692,30 @@ async function executeWithAiSdk(context) {
3602
3692
  const {
3603
3693
  scenario,
3604
3694
  cwd,
3605
- modelConfig,
3695
+ config,
3606
3696
  aiGatewayUrl,
3607
3697
  aiGatewayHeaders = {},
3608
3698
  mcps,
3609
3699
  traceContext
3610
3700
  } = context;
3701
+ const typed = config ? SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
3702
+ const cfg = typed?.success ? typed.data : void 0;
3703
+ const schemaKeys = new Set(Object.keys(SimpleAgentConfigSchema.shape));
3704
+ const configExtras = {};
3705
+ if (config) {
3706
+ for (const [key, value] of Object.entries(config)) {
3707
+ if (!schemaKeys.has(key)) configExtras[key] = value;
3708
+ }
3709
+ }
3611
3710
  if (!aiGatewayUrl) {
3612
3711
  throw new Error("Simple Agent requires aiGatewayUrl");
3613
3712
  }
3614
- if (!modelConfig?.model) {
3615
- throw new Error("Simple Agent requires a model in modelConfig");
3713
+ if (!cfg?.model) {
3714
+ throw new Error("Simple Agent requires a model in config");
3616
3715
  }
3617
- const model = createModel(modelConfig.model, aiGatewayUrl, aiGatewayHeaders);
3618
- const provider = isClaudeModelId(modelConfig.model) ? PROVIDER_ANTHROPIC2 : PROVIDER_OPENAI;
3716
+ const modelId = cfg.model;
3717
+ const model = createModel(modelId, aiGatewayUrl, aiGatewayHeaders);
3718
+ const provider = isClaudeModelId(modelId) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelId) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
3619
3719
  const systemPrompt = composeSystemPrompt(context);
3620
3720
  const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
3621
3721
  const startTime = Date.now();
@@ -3625,13 +3725,17 @@ async function executeWithAiSdk(context) {
3625
3725
  try {
3626
3726
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
3627
3727
  const isResponsesAPI = [...OPENAI_RESPONSES_MODEL_IDS].some(
3628
- (id) => modelConfig.model === id || modelConfig.model.startsWith(id)
3728
+ (id) => modelId === id || modelId.startsWith(id)
3629
3729
  );
3630
- const supportsThinking = isAnthropic || isResponsesAPI;
3631
- const providerOpts = {
3730
+ const isGemini = provider === PROVIDER_GEMINI2;
3731
+ const isGeminiThinking = isGemini && GEMINI_THINKING_MODEL_IDS.has(modelId);
3732
+ const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
3733
+ const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
3734
+ const reasoningEffort = cfg.reasoningEffort ?? "high";
3735
+ const computedProviderOpts = {
3632
3736
  ...isAnthropic && {
3633
3737
  anthropic: {
3634
- thinking: { type: "enabled", budgetTokens: 1e4 }
3738
+ thinking: { type: "enabled", budgetTokens: thinkingBudgetTokens }
3635
3739
  }
3636
3740
  },
3637
3741
  ...isResponsesAPI && {
@@ -3639,15 +3743,30 @@ async function executeWithAiSdk(context) {
3639
3743
  // Prevent the SDK from sending item_reference inputs that the proxy can't forward
3640
3744
  store: false,
3641
3745
  forceReasoning: true,
3642
- reasoningEffort: "high",
3746
+ reasoningEffort,
3643
3747
  reasoningSummary: "detailed"
3644
3748
  }
3749
+ },
3750
+ ...isGeminiThinking && {
3751
+ google: {
3752
+ thinkingConfig: {
3753
+ includeThoughts: true,
3754
+ thinkingBudget: 1e4
3755
+ }
3756
+ }
3645
3757
  }
3646
3758
  };
3759
+ const { providerOptions: extraProviderOptions, ...topLevelExtras } = configExtras;
3760
+ const mergedProviderOptions = {
3761
+ ...extraProviderOptions && typeof extraProviderOptions === "object" ? extraProviderOptions : {},
3762
+ ...computedProviderOpts
3763
+ };
3647
3764
  const stepTimestamps = [];
3765
+ const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
3648
3766
  const { triggerPromptImages } = context;
3649
3767
  const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
3650
- const generateTextParams = {
3768
+ const result = await generateText({
3769
+ ...topLevelExtras,
3651
3770
  model,
3652
3771
  system: systemPrompt,
3653
3772
  ...hasImages ? {
@@ -3665,11 +3784,16 @@ async function executeWithAiSdk(context) {
3665
3784
  }
3666
3785
  ]
3667
3786
  } : { prompt: scenario.triggerPrompt },
3668
- temperature: supportsThinking ? void 0 : modelConfig.temperature,
3669
- maxOutputTokens: modelConfig.maxTokens,
3787
+ temperature: supportsThinking ? void 0 : cfg.temperature,
3788
+ topP: supportsThinking ? void 0 : cfg.topP,
3789
+ frequencyPenalty: cfg.frequencyPenalty,
3790
+ presencePenalty: cfg.presencePenalty,
3791
+ seed: cfg.seed,
3792
+ stopSequences: cfg.stopSequences,
3793
+ maxOutputTokens: cfg.maxTokens,
3670
3794
  tools: mcpTools,
3671
- stopWhen: mcpTools ? stepCountIs(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : stepCountIs(1),
3672
- providerOptions: providerOpts,
3795
+ ...mcpTools && effectiveMaxTurns != null ? { stopWhen: stepCountIs(effectiveMaxTurns) } : !mcpTools ? { stopWhen: stepCountIs(1) } : {},
3796
+ providerOptions: mergedProviderOptions,
3673
3797
  onStepFinish: (step) => {
3674
3798
  stepTimestamps.push(Date.now());
3675
3799
  if (traceContext) {
@@ -3697,8 +3821,7 @@ async function executeWithAiSdk(context) {
3697
3821
  );
3698
3822
  }
3699
3823
  }
3700
- };
3701
- const result = await generateText(generateTextParams);
3824
+ });
3702
3825
  const durationMs = Date.now() - startTime;
3703
3826
  const usage = {
3704
3827
  inputTokens: result.usage.inputTokens ?? 0,
@@ -3709,7 +3832,7 @@ async function executeWithAiSdk(context) {
3709
3832
  result.steps,
3710
3833
  durationMs,
3711
3834
  usage,
3712
- modelConfig.model,
3835
+ modelId,
3713
3836
  provider,
3714
3837
  startTime,
3715
3838
  stepTimestamps
@@ -4622,11 +4745,12 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4622
4745
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4623
4746
  const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
4624
4747
  const targetName = evalData.presetName || agent?.name || "";
4748
+ const agentConfig = agent?.config;
4625
4749
  const executionContext = {
4626
4750
  skills: evalData.skills,
4627
4751
  scenario,
4628
4752
  cwd: workDir || process.cwd(),
4629
- modelConfig: agent?.modelConfig,
4753
+ config: agentConfig,
4630
4754
  aiGatewayUrl: config.aiGatewayUrl,
4631
4755
  aiGatewayHeaders: config.aiGatewayHeaders,
4632
4756
  traceContext: {
@@ -4661,7 +4785,18 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4661
4785
  infrastructurePaths
4662
4786
  );
4663
4787
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
4664
- const resolvedModelConfig = agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
4788
+ const snapshotModelConfig = agentConfig?.model ? {
4789
+ model: agentConfig.model,
4790
+ ...agentConfig.temperature != null && {
4791
+ temperature: agentConfig.temperature
4792
+ },
4793
+ ...agentConfig.maxTokens != null && {
4794
+ maxTokens: agentConfig.maxTokens
4795
+ },
4796
+ ...agentConfig.maxTurns != null && {
4797
+ maxTurns: agentConfig.maxTurns
4798
+ }
4799
+ } : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
4665
4800
  return {
4666
4801
  id: randomUUID4(),
4667
4802
  targetId,
@@ -4669,7 +4804,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4669
4804
  scenarioId: scenario.id,
4670
4805
  scenarioName: scenario.name,
4671
4806
  triggerPrompt: scenario.triggerPrompt,
4672
- modelConfig: resolvedModelConfig,
4807
+ modelConfig: snapshotModelConfig,
4673
4808
  duration: durationMs,
4674
4809
  outputText,
4675
4810
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,