@wix/evalforge-evaluator 0.146.0 → 0.148.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -330,10 +330,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
  if (evalRun.agentId) {
  agent = await api.getAgent(projectId2, evalRun.agentId);
  }
- let skills = [];
- let mcps = [];
- let subAgents = [];
- let rules = [];
+ const skills = [];
+ const mcps = [];
+ const subAgents = [];
+ const rules = [];
  if (evalRun.capabilityIds && evalRun.capabilityIds.length > 0) {
  const fetchResults = await Promise.allSettled(
  evalRun.capabilityIds.map((id) => api.getCapability(projectId2, id))
@@ -1190,10 +1190,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
  let lastAction = "Starting...";
  let lastToolName;
  let lastFilePath;
- const maxTurns = options.maxTurns ?? 10;
+ const maxTurns = options.maxTurns || void 0;
  let messageCount = 0;
  const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
- const baseAllowedTools = [
+ const baseAllowedTools = options.allowedTools ?? [
  "Skill",
  "Read",
  "Write",
@@ -1207,13 +1207,17 @@ async function executeWithClaudeCode(skills, scenario, options) {
  ];
  const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
  const queryOptions = {
+ ...options.extras ?? {},
  env: sdkEnv,
  cwd: options.cwd,
  settingSources: ["project"],
  allowedTools,
+ ...options.disallowedTools?.length ? { disallowedTools: options.disallowedTools } : {},
  model: options.model || DEFAULT_MODEL,
  maxTurns,
  maxThinkingTokens: options.maxThinkingTokens,
+ ...options.effort ? { effort: options.effort } : {},
+ ...options.maxBudgetUsd != null ? { maxBudgetUsd: options.maxBudgetUsd } : {},
  // Use 'default' permission mode with custom canUseTool handler
  // instead of 'bypassPermissions' which fails on root
  permissionMode: "default",
@@ -1288,7 +1292,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
  traceContext.authToken
  );
  }
- const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
+ const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
  let timeoutHandle;
  let timedOut = false;
  const HEARTBEAT_INTERVAL_MS = 1e4;
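
Note on the two maxTurns changes above: `options.maxTurns || void 0` treats both 0 and undefined as "no explicit turn cap", and the SDK timeout now prefers an explicit maxDurationMs, falling back to a 10-turn budget otherwise. A minimal standalone sketch of that resolution (illustrative only, not part of the package):

// Sketch: reproduces the maxTurns/timeout resolution from the hunks above.
function resolveLimits(options) {
  const maxTurns = options.maxTurns || void 0; // 0 and undefined both mean "uncapped"
  const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
  return { maxTurns, SDK_TIMEOUT_MS };
}
resolveLimits({});                                   // { maxTurns: undefined, SDK_TIMEOUT_MS: 600000 }
resolveLimits({ maxTurns: 0 });                      // uncapped turns, default 10-turn timeout window
resolveLimits({ maxTurns: 3, maxDurationMs: 12e4 }); // { maxTurns: 3, SDK_TIMEOUT_MS: 120000 }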
@@ -2052,7 +2056,7 @@ var ClaudeCodeAdapter = class {
  skills,
  scenario,
  cwd,
- modelConfig,
+ config,
  aiGatewayUrl,
  aiGatewayHeaders,
  traceContext,
@@ -2061,20 +2065,37 @@ var ClaudeCodeAdapter = class {
  rules,
  systemPrompt
  } = context;
- const modelForSdk = modelConfig?.model;
+ const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
+ const cfg = typed?.success ? typed.data : void 0;
+ const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
+ const extras = {};
+ if (config) {
+ for (const [key, value] of Object.entries(config)) {
+ if (!schemaKeys.has(key)) extras[key] = value;
+ }
+ }
+ const rawMaxTurns = cfg?.maxTurns;
+ const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
  const options = {
  cwd,
- model: modelForSdk,
- temperature: modelConfig?.temperature,
- maxTokens: modelConfig?.maxTokens,
- maxTurns: modelConfig?.maxTurns,
+ model: cfg?.model,
+ temperature: cfg?.temperature,
+ maxTokens: cfg?.maxTokens,
+ maxTurns,
+ maxThinkingTokens: cfg?.maxThinkingTokens,
+ allowedTools: cfg?.allowedTools,
+ disallowedTools: cfg?.disallowedTools,
+ effort: cfg?.effort,
+ maxBudgetUsd: cfg?.maxBudgetUsd,
+ maxDurationMs: cfg?.maxDurationMs,
  aiGatewayUrl,
  aiGatewayHeaders,
  traceContext,
  mcps,
  subAgents,
  rules,
- systemPrompt
+ systemPrompt,
+ extras
  };
  const { result, llmTrace, conversation } = await executeWithClaudeCode(
  skills,
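
The ClaudeCodeAdapter hunk above swaps the old modelConfig for a free-form config object: keys known to ClaudeCodeConfigSchema are validated, anything else is collected into extras and later spread into the SDK query options. A minimal sketch of that split using a stand-in zod schema (the real schema and its keys live in @wix/evalforge-types; the fallbackModel key below is hypothetical):

// Sketch: known keys validated, unknown keys forwarded as extras.
const { z } = require("zod");
const ConfigSchema = z.object({ model: z.string().optional(), maxTurns: z.number().optional() }); // stand-in
function splitConfig(config) {
  const typed = config ? ConfigSchema.passthrough().safeParse(config) : void 0;
  const cfg = typed?.success ? typed.data : void 0;
  const schemaKeys = new Set(Object.keys(ConfigSchema.shape));
  const extras = {};
  if (config) {
    for (const [key, value] of Object.entries(config)) {
      if (!schemaKeys.has(key)) extras[key] = value; // unknown key, forwarded verbatim
    }
  }
  return { cfg, extras };
}
splitConfig({ model: "claude-sonnet-4-6", maxTurns: 5, fallbackModel: "x" });
// -> cfg keeps everything (passthrough), extras = { fallbackModel: "x" }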
@@ -2214,18 +2235,29 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
  var import_os3 = require("os");
  var import_evalforge_types6 = require("@wix/evalforge-types");
  var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
+ var OPENCODE_MODEL_ALIASES = {
+ "claude-sonnet-4": "claude-sonnet-4-0",
+ "claude-opus-4": "claude-opus-4-0"
+ };
  function parseModel(model) {
  const slashIndex = model.indexOf("/");
  if (slashIndex > 0) {
+ const providerID = model.slice(0, slashIndex);
+ const rawModelID = model.slice(slashIndex + 1);
  return {
- providerID: model.slice(0, slashIndex),
- modelID: model.slice(slashIndex + 1)
+ providerID,
+ modelID: OPENCODE_MODEL_ALIASES[rawModelID] ?? rawModelID
  };
  }
+ const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
  const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
  model
  );
- return { providerID: isOpenAI ? "openai" : "anthropic", modelID: model };
+ const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
+ model
+ );
+ if (isGemini) return { providerID: "google", modelID };
+ return { providerID: isOpenAI ? "openai" : "anthropic", modelID };
  }
  function toOpenCodeMcpConfig(servers) {
  const result = {};
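
parseModel now normalizes bare Claude aliases and routes Gemini IDs to the google provider. Illustrative expectations only (assuming "gemini-2.5-pro" is listed in AVAILABLE_GEMINI_MODEL_IDS, whose exact contents live in @wix/evalforge-types):

// parseModel("openai/gpt-4.1")          -> { providerID: "openai", modelID: "gpt-4.1" }
// parseModel("anthropic/claude-opus-4") -> { providerID: "anthropic", modelID: "claude-opus-4-0" } (alias applied)
// parseModel("claude-sonnet-4")         -> { providerID: "anthropic", modelID: "claude-sonnet-4-0" }
// parseModel("gemini-2.5-pro")          -> { providerID: "google", modelID: "gemini-2.5-pro" }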
@@ -2276,8 +2308,9 @@ async function buildOpenCodeEnv(options) {
  const { providerID, modelID } = parseModel(modelStr);
  const provider = {};
  if (options.aiGatewayUrl) {
+ const proxyPath = providerID === "google" ? "gemini" : providerID;
  const providerOptions = {
- baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
+ baseURL: `${options.aiGatewayUrl}/proxy/${proxyPath}`,
  apiKey: "sk-placeholder-auth-handled-by-gateway"
  };
  if (options.aiGatewayHeaders) {
@@ -2308,20 +2341,26 @@ async function buildOpenCodeEnv(options) {
  if (options.temperature != null) {
  agentOverrides.temperature = options.temperature;
  }
- if (options.maxTurns != null) {
+ if (options.topP != null) {
+ agentOverrides.top_p = options.topP;
+ }
+ if (options.maxTurns != null && options.maxTurns > 0) {
  agentOverrides.maxSteps = options.maxTurns;
  }
+ const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
+ const configPermission = parsed?.success ? parsed.data.permission : void 0;
+ const defaultPermission = {
+ "*": "allow"
+ };
+ const permission = {
+ ...defaultPermission,
+ ...configPermission
+ };
  const config = {
  model: `${providerID}/${modelID}`,
  provider,
  ...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
- permission: {
- edit: "allow",
- bash: "allow",
- webfetch: "allow",
- doom_loop: "allow",
- external_directory: "allow"
- },
+ permission,
  ...mcp ? { mcp } : {}
  };
  const env = {
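
The hard-coded permission block is replaced by a wildcard default that a user-supplied OpenCode config can override per key, with the config value winning on conflict. A tiny sketch of the merge (the bash override below is hypothetical):

// Sketch: wildcard default overlaid by config-provided permissions.
const defaultPermission = { "*": "allow" };
const configPermission = { bash: "ask" }; // hypothetical user override from options.config
const permission = { ...defaultPermission, ...configPermission };
// -> { "*": "allow", bash: "ask" }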
@@ -3055,16 +3094,18 @@ async function executeWithOpenCode(skills, scenario, options) {
  hasAiGatewayHeaders: !!options.aiGatewayHeaders,
  model: options.model
  });
- const maxTurns = options.maxTurns ?? 10;
- const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
+ const maxTurns = options.maxTurns || void 0;
+ const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
  const { env, providerID, modelID } = await buildOpenCodeEnv({
  model: options.model,
  temperature: options.temperature,
+ topP: options.topP,
  maxTurns,
  aiGatewayUrl: options.aiGatewayUrl,
  aiGatewayHeaders: options.aiGatewayHeaders,
  mcps: options.mcps,
- cwd: options.cwd
+ cwd: options.cwd,
+ config: options.config
  });
  const startTime = /* @__PURE__ */ new Date();
  const traceContext = options.traceContext;
@@ -3092,13 +3133,13 @@ async function executeWithOpenCode(skills, scenario, options) {
  traceContext.authToken
  );
  }
+ const variant = options.thinkingVariant ?? "high";
+ const thinkingArgs = variant === "none" ? [] : ["--thinking", "--variant", variant];
  const baseArgs = [
  "run",
  "--format",
  "json",
- "--thinking",
- "--variant",
- "high",
+ ...thinkingArgs,
  "--model",
  `${providerID}/${modelID}`,
  "--dir",
@@ -3289,7 +3330,7 @@ var OpenCodeAdapter = class {
  skills,
  scenario,
  cwd,
- modelConfig,
+ config,
  aiGatewayUrl,
  aiGatewayHeaders,
  traceContext,
@@ -3298,18 +3339,26 @@ var OpenCodeAdapter = class {
  rules,
  systemPrompt
  } = context;
+ const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
+ const cfg = typed?.success ? typed.data : void 0;
+ const rawMaxTurns = cfg?.maxTurns;
+ const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
  const options = {
  cwd,
- model: modelConfig?.model,
- temperature: modelConfig?.temperature,
- maxTurns: modelConfig?.maxTurns,
+ model: cfg?.model,
+ temperature: cfg?.temperature,
+ topP: cfg?.topP,
+ maxTurns,
+ thinkingVariant: cfg?.thinkingVariant,
+ maxDurationMs: cfg?.maxDurationMs,
  aiGatewayUrl,
  aiGatewayHeaders,
  traceContext,
  mcps,
  subAgents,
  rules,
- systemPrompt
+ systemPrompt,
+ config
  };
  const { result, llmTrace, conversation } = await executeWithOpenCode(
  skills,
@@ -3338,6 +3387,7 @@ defaultRegistry.register(openCodeAdapter);
  // src/run-scenario/agents/simple-agent/execute.ts
  var import_ai = require("ai");
  var import_anthropic = require("@ai-sdk/anthropic");
+ var import_google = require("@ai-sdk/google");
  var import_openai = require("@ai-sdk/openai");
  var import_evalforge_types11 = require("@wix/evalforge-types");
  var import_crypto3 = require("crypto");
@@ -3360,8 +3410,9 @@ async function buildMcpTools(mcps, cwd) {
  const client = await (0, import_mcp.createMCPClient)({ transport });
  clients.push(client);
  const tools = await client.tools();
+ const safePrefix = serverName.replace(/[^a-zA-Z0-9]/g, "_");
  for (const [toolName, tool] of Object.entries(tools)) {
- allTools[`${serverName}__${toolName}`] = tool;
+ allTools[`${safePrefix}_${toolName}`] = tool;
  }
  }
  }
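
MCP tool names are now prefixed with a sanitized server name (non-alphanumeric characters collapsed to underscores) and the separator shrinks from "__" to "_". A quick sketch with a hypothetical server name:

// Sketch: tool-name prefixing as implemented above.
const serverName = "my-mcp.server"; // hypothetical
const safePrefix = serverName.replace(/[^a-zA-Z0-9]/g, "_");
const toolKey = `${safePrefix}_list_items`; // "my_mcp_server_list_items"
// Previously this key would have been "my-mcp.server__list_items".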
@@ -3437,6 +3488,7 @@ function extractErrorText(content) {
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
  var import_evalforge_types10 = require("@wix/evalforge-types");
  var PROVIDER_ANTHROPIC = "anthropic";
+ var PROVIDER_GEMINI = "gemini";
  var MODEL_PRICING = {
  // Anthropic — Claude 4.6
  "claude-sonnet-4-6": { input: 3, output: 15 },
@@ -3463,7 +3515,18 @@ var MODEL_PRICING = {
  o3: { input: 2, output: 8 },
  "o4-mini": { input: 1.1, output: 4.4 },
  "o3-mini": { input: 1.1, output: 4.4 },
- o1: { input: 15, output: 60 }
+ o1: { input: 15, output: 60 },
+ // Google Gemini 2.0
+ "gemini-2.0-flash": { input: 0.1, output: 0.4 },
+ "gemini-2.0-flash-lite": { input: 0.075, output: 0.3 },
+ // Google Gemini 2.5
+ "gemini-2.5-pro": { input: 1.25, output: 10 },
+ "gemini-2.5-flash": { input: 0.15, output: 0.6 },
+ "gemini-2.5-flash-lite": { input: 0.075, output: 0.3 },
+ // Google Gemini 3.x — standard pricing up to 200K context tokens
+ "gemini-3-pro-preview": { input: 2, output: 12 },
+ "gemini-3-flash-preview": { input: 0.5, output: 3 },
+ "gemini-3.1-pro-preview": { input: 2, output: 12 }
  };
  function extractGatewayCost(step, provider) {
  try {
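
The pricing map gains Gemini entries. Assuming the figures are USD per million tokens (consistent with the existing Claude and OpenAI rows), a cost estimate for one of the new models would look like this; the token counts are made up for illustration:

// Sketch: estimated cost from a MODEL_PRICING row, assuming USD per 1M tokens.
const pricing = { input: 1.25, output: 10 }; // the "gemini-2.5-pro" row above
const usage = { inputTokens: 120000, outputTokens: 8000 }; // hypothetical
const costUsd = (usage.inputTokens / 1e6) * pricing.input + (usage.outputTokens / 1e6) * pricing.output;
// -> 0.15 + 0.08 = 0.23 USD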
@@ -3474,6 +3537,15 @@ function extractGatewayCost(step, provider) {
  const cost2 = usage?.total_cost_usd;
  return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
  }
+ if (provider === PROVIDER_GEMINI) {
+ const meta = step.providerMetadata;
+ const google = meta?.google;
+ const cost2 = google?.total_cost_usd;
+ if (typeof cost2 === "number" && cost2 > 0) return cost2;
+ const body2 = step.response?.body;
+ const bodyCost = body2?.total_cost_usd;
+ return typeof bodyCost === "number" && bodyCost > 0 ? bodyCost : void 0;
+ }
  const body = step.response?.body;
  const cost = body?.total_cost_usd;
  return typeof cost === "number" && cost > 0 ? cost : void 0;
@@ -3551,10 +3623,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
  // src/run-scenario/agents/simple-agent/execute.ts
  var PROVIDER_ANTHROPIC2 = "anthropic";
  var PROVIDER_OPENAI = "openai";
+ var PROVIDER_GEMINI2 = "gemini";
  var DEFAULT_MAX_TOOL_STEPS = 25;
  function createModel(modelId, baseUrl, headers) {
- const isClaudeModel = isClaudeModelId(modelId);
- if (isClaudeModel) {
+ if (isClaudeModelId(modelId)) {
  const anthropic = (0, import_anthropic.createAnthropic)({
  baseURL: `${baseUrl}/proxy/anthropic`,
  apiKey: "proxy-auth",
@@ -3562,6 +3634,14 @@ function createModel(modelId, baseUrl, headers) {
  });
  return anthropic(modelId);
  }
+ if (isGeminiModelId(modelId)) {
+ const google = (0, import_google.createGoogleGenerativeAI)({
+ baseURL: `${baseUrl}/proxy/gemini`,
+ apiKey: "proxy-auth",
+ headers
+ });
+ return google(modelId);
+ }
  const openai = (0, import_openai.createOpenAI)({
  baseURL: `${baseUrl}/proxy/openai`,
  apiKey: "proxy-auth",
@@ -3579,6 +3659,11 @@ function isClaudeModelId(modelId) {
  (id) => modelId === id || modelId.startsWith(id)
  );
  }
+ function isGeminiModelId(modelId) {
+ return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
+ (id) => modelId === id || modelId.startsWith(id)
+ );
+ }
  function extractSkillContent(files) {
  if (!files || files.length === 0) return void 0;
  const skillMd = files.find((f) => f.path === "SKILL.md");
@@ -3589,20 +3674,30 @@ async function executeWithAiSdk(context) {
  const {
  scenario,
  cwd,
- modelConfig,
+ config,
  aiGatewayUrl,
  aiGatewayHeaders = {},
  mcps,
  traceContext
  } = context;
+ const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
+ const cfg = typed?.success ? typed.data : void 0;
+ const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
+ const configExtras = {};
+ if (config) {
+ for (const [key, value] of Object.entries(config)) {
+ if (!schemaKeys.has(key)) configExtras[key] = value;
+ }
+ }
  if (!aiGatewayUrl) {
  throw new Error("Simple Agent requires aiGatewayUrl");
  }
- if (!modelConfig?.model) {
- throw new Error("Simple Agent requires a model in modelConfig");
+ if (!cfg?.model) {
+ throw new Error("Simple Agent requires a model in config");
  }
- const model = createModel(modelConfig.model, aiGatewayUrl, aiGatewayHeaders);
- const provider = isClaudeModelId(modelConfig.model) ? PROVIDER_ANTHROPIC2 : PROVIDER_OPENAI;
+ const modelId = cfg.model;
+ const model = createModel(modelId, aiGatewayUrl, aiGatewayHeaders);
+ const provider = isClaudeModelId(modelId) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelId) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
  const systemPrompt = composeSystemPrompt(context);
  const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
  const startTime = Date.now();
@@ -3612,13 +3707,17 @@ async function executeWithAiSdk(context) {
  try {
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
  const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
- (id) => modelConfig.model === id || modelConfig.model.startsWith(id)
+ (id) => modelId === id || modelId.startsWith(id)
  );
- const supportsThinking = isAnthropic || isResponsesAPI;
- const providerOpts = {
+ const isGemini = provider === PROVIDER_GEMINI2;
+ const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
+ const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
+ const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
+ const reasoningEffort = cfg.reasoningEffort ?? "high";
+ const computedProviderOpts = {
  ...isAnthropic && {
  anthropic: {
- thinking: { type: "enabled", budgetTokens: 1e4 }
+ thinking: { type: "enabled", budgetTokens: thinkingBudgetTokens }
  }
  },
  ...isResponsesAPI && {
@@ -3626,15 +3725,30 @@ async function executeWithAiSdk(context) {
  // Prevent the SDK from sending item_reference inputs that the proxy can't forward
  store: false,
  forceReasoning: true,
- reasoningEffort: "high",
+ reasoningEffort,
  reasoningSummary: "detailed"
  }
+ },
+ ...isGeminiThinking && {
+ google: {
+ thinkingConfig: {
+ includeThoughts: true,
+ thinkingBudget: 1e4
+ }
+ }
  }
  };
+ const { providerOptions: extraProviderOptions, ...topLevelExtras } = configExtras;
+ const mergedProviderOptions = {
+ ...extraProviderOptions && typeof extraProviderOptions === "object" ? extraProviderOptions : {},
+ ...computedProviderOpts
+ };
  const stepTimestamps = [];
+ const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
  const { triggerPromptImages } = context;
  const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
- const generateTextParams = {
+ const result = await (0, import_ai.generateText)({
+ ...topLevelExtras,
  model,
  system: systemPrompt,
  ...hasImages ? {
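
The hunk above also splits config extras into providerOptions versus top-level generateText parameters, with the computed provider options winning per provider key. A small sketch of the merge (the extra keys shown are hypothetical):

// Sketch: config extras split and merged with computed provider options.
const configExtras = { providerOptions: { openai: { parallelToolCalls: false } }, maxRetries: 2 }; // hypothetical
const computedProviderOpts = { anthropic: { thinking: { type: "enabled", budgetTokens: 1e4 } } };
const { providerOptions: extraProviderOptions, ...topLevelExtras } = configExtras;
const mergedProviderOptions = {
  ...(extraProviderOptions && typeof extraProviderOptions === "object" ? extraProviderOptions : {}),
  ...computedProviderOpts
};
// mergedProviderOptions -> { openai: { parallelToolCalls: false }, anthropic: { thinking: ... } }
// topLevelExtras        -> { maxRetries: 2 }, spread into the generateText call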
@@ -3652,11 +3766,16 @@ async function executeWithAiSdk(context) {
  }
  ]
  } : { prompt: scenario.triggerPrompt },
- temperature: supportsThinking ? void 0 : modelConfig.temperature,
- maxOutputTokens: modelConfig.maxTokens,
+ temperature: supportsThinking ? void 0 : cfg.temperature,
+ topP: supportsThinking ? void 0 : cfg.topP,
+ frequencyPenalty: cfg.frequencyPenalty,
+ presencePenalty: cfg.presencePenalty,
+ seed: cfg.seed,
+ stopSequences: cfg.stopSequences,
+ maxOutputTokens: cfg.maxTokens,
  tools: mcpTools,
- stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
- providerOptions: providerOpts,
+ ...mcpTools && effectiveMaxTurns != null ? { stopWhen: (0, import_ai.stepCountIs)(effectiveMaxTurns) } : !mcpTools ? { stopWhen: (0, import_ai.stepCountIs)(1) } : {},
+ providerOptions: mergedProviderOptions,
  onStepFinish: (step) => {
  stepTimestamps.push(Date.now());
  if (traceContext) {
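
The stopWhen parameter is now conditional: with MCP tools and a positive turn limit it stops at effectiveMaxTurns, with maxTurns set to 0 the tool loop is uncapped, and without tools it still stops after one step. A compact sketch of the branch logic (stepCountIs is the helper from the ai package):

// Sketch: which stopWhen (if any) ends up in the generateText params.
function resolveStopWhen(hasTools, maxTurns, stepCountIs, DEFAULT_MAX_TOOL_STEPS = 25) {
  const effectiveMaxTurns = maxTurns === 0 ? void 0 : maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
  if (hasTools && effectiveMaxTurns != null) return { stopWhen: stepCountIs(effectiveMaxTurns) };
  if (!hasTools) return { stopWhen: stepCountIs(1) };
  return {}; // tools present but maxTurns === 0: no step cap
}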
@@ -3684,8 +3803,7 @@ async function executeWithAiSdk(context) {
  );
  }
  }
- };
- const result = await (0, import_ai.generateText)(generateTextParams);
+ });
  const durationMs = Date.now() - startTime;
  const usage = {
  inputTokens: result.usage.inputTokens ?? 0,
@@ -3696,7 +3814,7 @@ async function executeWithAiSdk(context) {
  result.steps,
  durationMs,
  usage,
- modelConfig.model,
+ modelId,
  provider,
  startTime,
  stepTimestamps
@@ -4609,11 +4727,12 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
  const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
  const targetName = evalData.presetName || agent?.name || "";
+ const agentConfig = agent?.config;
  const executionContext = {
  skills: evalData.skills,
  scenario,
  cwd: workDir || process.cwd(),
- modelConfig: agent?.modelConfig,
+ config: agentConfig,
  aiGatewayUrl: config.aiGatewayUrl,
  aiGatewayHeaders: config.aiGatewayHeaders,
  traceContext: {
@@ -4648,7 +4767,18 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
  infrastructurePaths
  );
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
- const resolvedModelConfig = agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
+ const snapshotModelConfig = agentConfig?.model ? {
+ model: agentConfig.model,
+ ...agentConfig.temperature != null && {
+ temperature: agentConfig.temperature
+ },
+ ...agentConfig.maxTokens != null && {
+ maxTokens: agentConfig.maxTokens
+ },
+ ...agentConfig.maxTurns != null && {
+ maxTurns: agentConfig.maxTurns
+ }
+ } : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
  return {
  id: (0, import_crypto4.randomUUID)(),
  targetId,
@@ -4656,7 +4786,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
  scenarioId: scenario.id,
  scenarioName: scenario.name,
  triggerPrompt: scenario.triggerPrompt,
- modelConfig: resolvedModelConfig,
+ modelConfig: snapshotModelConfig,
  duration: durationMs,
  outputText,
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,