@wix/evalforge-evaluator 0.147.0 → 0.148.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -330,10 +330,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
330
330
  if (evalRun.agentId) {
331
331
  agent = await api.getAgent(projectId2, evalRun.agentId);
332
332
  }
333
- let skills = [];
334
- let mcps = [];
335
- let subAgents = [];
336
- let rules = [];
333
+ const skills = [];
334
+ const mcps = [];
335
+ const subAgents = [];
336
+ const rules = [];
337
337
  if (evalRun.capabilityIds && evalRun.capabilityIds.length > 0) {
338
338
  const fetchResults = await Promise.allSettled(
339
339
  evalRun.capabilityIds.map((id) => api.getCapability(projectId2, id))
@@ -1190,10 +1190,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
1190
1190
  let lastAction = "Starting...";
1191
1191
  let lastToolName;
1192
1192
  let lastFilePath;
1193
- const maxTurns = options.maxTurns ?? 10;
1193
+ const maxTurns = options.maxTurns || void 0;
1194
1194
  let messageCount = 0;
1195
1195
  const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
1196
- const baseAllowedTools = [
1196
+ const baseAllowedTools = options.allowedTools ?? [
1197
1197
  "Skill",
1198
1198
  "Read",
1199
1199
  "Write",
@@ -1207,13 +1207,17 @@ async function executeWithClaudeCode(skills, scenario, options) {
1207
1207
  ];
1208
1208
  const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
1209
1209
  const queryOptions = {
1210
+ ...options.extras ?? {},
1210
1211
  env: sdkEnv,
1211
1212
  cwd: options.cwd,
1212
1213
  settingSources: ["project"],
1213
1214
  allowedTools,
1215
+ ...options.disallowedTools?.length ? { disallowedTools: options.disallowedTools } : {},
1214
1216
  model: options.model || DEFAULT_MODEL,
1215
1217
  maxTurns,
1216
1218
  maxThinkingTokens: options.maxThinkingTokens,
1219
+ ...options.effort ? { effort: options.effort } : {},
1220
+ ...options.maxBudgetUsd != null ? { maxBudgetUsd: options.maxBudgetUsd } : {},
1217
1221
  // Use 'default' permission mode with custom canUseTool handler
1218
1222
  // instead of 'bypassPermissions' which fails on root
1219
1223
  permissionMode: "default",
@@ -1288,7 +1292,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
1288
1292
  traceContext.authToken
1289
1293
  );
1290
1294
  }
1291
- const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
1295
+ const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
1292
1296
  let timeoutHandle;
1293
1297
  let timedOut = false;
1294
1298
  const HEARTBEAT_INTERVAL_MS = 1e4;
@@ -2052,7 +2056,7 @@ var ClaudeCodeAdapter = class {
2052
2056
  skills,
2053
2057
  scenario,
2054
2058
  cwd,
2055
- modelConfig,
2059
+ config,
2056
2060
  aiGatewayUrl,
2057
2061
  aiGatewayHeaders,
2058
2062
  traceContext,
@@ -2061,20 +2065,37 @@ var ClaudeCodeAdapter = class {
2061
2065
  rules,
2062
2066
  systemPrompt
2063
2067
  } = context;
2064
- const modelForSdk = modelConfig?.model;
2068
+ const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
2069
+ const cfg = typed?.success ? typed.data : void 0;
2070
+ const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
2071
+ const extras = {};
2072
+ if (config) {
2073
+ for (const [key, value] of Object.entries(config)) {
2074
+ if (!schemaKeys.has(key)) extras[key] = value;
2075
+ }
2076
+ }
2077
+ const rawMaxTurns = cfg?.maxTurns;
2078
+ const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
2065
2079
  const options = {
2066
2080
  cwd,
2067
- model: modelForSdk,
2068
- temperature: modelConfig?.temperature,
2069
- maxTokens: modelConfig?.maxTokens,
2070
- maxTurns: modelConfig?.maxTurns,
2081
+ model: cfg?.model,
2082
+ temperature: cfg?.temperature,
2083
+ maxTokens: cfg?.maxTokens,
2084
+ maxTurns,
2085
+ maxThinkingTokens: cfg?.maxThinkingTokens,
2086
+ allowedTools: cfg?.allowedTools,
2087
+ disallowedTools: cfg?.disallowedTools,
2088
+ effort: cfg?.effort,
2089
+ maxBudgetUsd: cfg?.maxBudgetUsd,
2090
+ maxDurationMs: cfg?.maxDurationMs,
2071
2091
  aiGatewayUrl,
2072
2092
  aiGatewayHeaders,
2073
2093
  traceContext,
2074
2094
  mcps,
2075
2095
  subAgents,
2076
2096
  rules,
2077
- systemPrompt
2097
+ systemPrompt,
2098
+ extras
2078
2099
  };
2079
2100
  const { result, llmTrace, conversation } = await executeWithClaudeCode(
2080
2101
  skills,
@@ -2214,22 +2235,29 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
2214
2235
  var import_os3 = require("os");
2215
2236
  var import_evalforge_types6 = require("@wix/evalforge-types");
2216
2237
  var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
2238
+ var OPENCODE_MODEL_ALIASES = {
2239
+ "claude-sonnet-4": "claude-sonnet-4-0",
2240
+ "claude-opus-4": "claude-opus-4-0"
2241
+ };
2217
2242
  function parseModel(model) {
2218
2243
  const slashIndex = model.indexOf("/");
2219
2244
  if (slashIndex > 0) {
2245
+ const providerID = model.slice(0, slashIndex);
2246
+ const rawModelID = model.slice(slashIndex + 1);
2220
2247
  return {
2221
- providerID: model.slice(0, slashIndex),
2222
- modelID: model.slice(slashIndex + 1)
2248
+ providerID,
2249
+ modelID: OPENCODE_MODEL_ALIASES[rawModelID] ?? rawModelID
2223
2250
  };
2224
2251
  }
2252
+ const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
2225
2253
  const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
2226
2254
  model
2227
2255
  );
2228
2256
  const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
2229
2257
  model
2230
2258
  );
2231
- if (isGemini) return { providerID: "google", modelID: model };
2232
- return { providerID: isOpenAI ? "openai" : "anthropic", modelID: model };
2259
+ if (isGemini) return { providerID: "google", modelID };
2260
+ return { providerID: isOpenAI ? "openai" : "anthropic", modelID };
2233
2261
  }
2234
2262
  function toOpenCodeMcpConfig(servers) {
2235
2263
  const result = {};
@@ -2313,20 +2341,26 @@ async function buildOpenCodeEnv(options) {
2313
2341
  if (options.temperature != null) {
2314
2342
  agentOverrides.temperature = options.temperature;
2315
2343
  }
2316
- if (options.maxTurns != null) {
2344
+ if (options.topP != null) {
2345
+ agentOverrides.top_p = options.topP;
2346
+ }
2347
+ if (options.maxTurns != null && options.maxTurns > 0) {
2317
2348
  agentOverrides.maxSteps = options.maxTurns;
2318
2349
  }
2350
+ const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
2351
+ const configPermission = parsed?.success ? parsed.data.permission : void 0;
2352
+ const defaultPermission = {
2353
+ "*": "allow"
2354
+ };
2355
+ const permission = {
2356
+ ...defaultPermission,
2357
+ ...configPermission
2358
+ };
2319
2359
  const config = {
2320
2360
  model: `${providerID}/${modelID}`,
2321
2361
  provider,
2322
2362
  ...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
2323
- permission: {
2324
- edit: "allow",
2325
- bash: "allow",
2326
- webfetch: "allow",
2327
- doom_loop: "allow",
2328
- external_directory: "allow"
2329
- },
2363
+ permission,
2330
2364
  ...mcp ? { mcp } : {}
2331
2365
  };
2332
2366
  const env = {
@@ -3060,16 +3094,18 @@ async function executeWithOpenCode(skills, scenario, options) {
3060
3094
  hasAiGatewayHeaders: !!options.aiGatewayHeaders,
3061
3095
  model: options.model
3062
3096
  });
3063
- const maxTurns = options.maxTurns ?? 10;
3064
- const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
3097
+ const maxTurns = options.maxTurns || void 0;
3098
+ const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
3065
3099
  const { env, providerID, modelID } = await buildOpenCodeEnv({
3066
3100
  model: options.model,
3067
3101
  temperature: options.temperature,
3102
+ topP: options.topP,
3068
3103
  maxTurns,
3069
3104
  aiGatewayUrl: options.aiGatewayUrl,
3070
3105
  aiGatewayHeaders: options.aiGatewayHeaders,
3071
3106
  mcps: options.mcps,
3072
- cwd: options.cwd
3107
+ cwd: options.cwd,
3108
+ config: options.config
3073
3109
  });
3074
3110
  const startTime = /* @__PURE__ */ new Date();
3075
3111
  const traceContext = options.traceContext;
@@ -3097,13 +3133,13 @@ async function executeWithOpenCode(skills, scenario, options) {
3097
3133
  traceContext.authToken
3098
3134
  );
3099
3135
  }
3136
+ const variant = options.thinkingVariant ?? "high";
3137
+ const thinkingArgs = variant === "none" ? [] : ["--thinking", "--variant", variant];
3100
3138
  const baseArgs = [
3101
3139
  "run",
3102
3140
  "--format",
3103
3141
  "json",
3104
- "--thinking",
3105
- "--variant",
3106
- "high",
3142
+ ...thinkingArgs,
3107
3143
  "--model",
3108
3144
  `${providerID}/${modelID}`,
3109
3145
  "--dir",
@@ -3294,7 +3330,7 @@ var OpenCodeAdapter = class {
3294
3330
  skills,
3295
3331
  scenario,
3296
3332
  cwd,
3297
- modelConfig,
3333
+ config,
3298
3334
  aiGatewayUrl,
3299
3335
  aiGatewayHeaders,
3300
3336
  traceContext,
@@ -3303,18 +3339,26 @@ var OpenCodeAdapter = class {
3303
3339
  rules,
3304
3340
  systemPrompt
3305
3341
  } = context;
3342
+ const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
3343
+ const cfg = typed?.success ? typed.data : void 0;
3344
+ const rawMaxTurns = cfg?.maxTurns;
3345
+ const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
3306
3346
  const options = {
3307
3347
  cwd,
3308
- model: modelConfig?.model,
3309
- temperature: modelConfig?.temperature,
3310
- maxTurns: modelConfig?.maxTurns,
3348
+ model: cfg?.model,
3349
+ temperature: cfg?.temperature,
3350
+ topP: cfg?.topP,
3351
+ maxTurns,
3352
+ thinkingVariant: cfg?.thinkingVariant,
3353
+ maxDurationMs: cfg?.maxDurationMs,
3311
3354
  aiGatewayUrl,
3312
3355
  aiGatewayHeaders,
3313
3356
  traceContext,
3314
3357
  mcps,
3315
3358
  subAgents,
3316
3359
  rules,
3317
- systemPrompt
3360
+ systemPrompt,
3361
+ config
3318
3362
  };
3319
3363
  const { result, llmTrace, conversation } = await executeWithOpenCode(
3320
3364
  skills,
@@ -3630,20 +3674,30 @@ async function executeWithAiSdk(context) {
3630
3674
  const {
3631
3675
  scenario,
3632
3676
  cwd,
3633
- modelConfig,
3677
+ config,
3634
3678
  aiGatewayUrl,
3635
3679
  aiGatewayHeaders = {},
3636
3680
  mcps,
3637
3681
  traceContext
3638
3682
  } = context;
3683
+ const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
3684
+ const cfg = typed?.success ? typed.data : void 0;
3685
+ const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
3686
+ const configExtras = {};
3687
+ if (config) {
3688
+ for (const [key, value] of Object.entries(config)) {
3689
+ if (!schemaKeys.has(key)) configExtras[key] = value;
3690
+ }
3691
+ }
3639
3692
  if (!aiGatewayUrl) {
3640
3693
  throw new Error("Simple Agent requires aiGatewayUrl");
3641
3694
  }
3642
- if (!modelConfig?.model) {
3643
- throw new Error("Simple Agent requires a model in modelConfig");
3695
+ if (!cfg?.model) {
3696
+ throw new Error("Simple Agent requires a model in config");
3644
3697
  }
3645
- const model = createModel(modelConfig.model, aiGatewayUrl, aiGatewayHeaders);
3646
- const provider = isClaudeModelId(modelConfig.model) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelConfig.model) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
3698
+ const modelId = cfg.model;
3699
+ const model = createModel(modelId, aiGatewayUrl, aiGatewayHeaders);
3700
+ const provider = isClaudeModelId(modelId) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelId) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
3647
3701
  const systemPrompt = composeSystemPrompt(context);
3648
3702
  const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
3649
3703
  const startTime = Date.now();
@@ -3653,15 +3707,17 @@ async function executeWithAiSdk(context) {
3653
3707
  try {
3654
3708
  const isAnthropic = provider === PROVIDER_ANTHROPIC2;
3655
3709
  const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
3656
- (id) => modelConfig.model === id || modelConfig.model.startsWith(id)
3710
+ (id) => modelId === id || modelId.startsWith(id)
3657
3711
  );
3658
3712
  const isGemini = provider === PROVIDER_GEMINI2;
3659
- const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelConfig.model);
3713
+ const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
3660
3714
  const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
3661
- const providerOpts = {
3715
+ const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
3716
+ const reasoningEffort = cfg.reasoningEffort ?? "high";
3717
+ const computedProviderOpts = {
3662
3718
  ...isAnthropic && {
3663
3719
  anthropic: {
3664
- thinking: { type: "enabled", budgetTokens: 1e4 }
3720
+ thinking: { type: "enabled", budgetTokens: thinkingBudgetTokens }
3665
3721
  }
3666
3722
  },
3667
3723
  ...isResponsesAPI && {
@@ -3669,7 +3725,7 @@ async function executeWithAiSdk(context) {
3669
3725
  // Prevent the SDK from sending item_reference inputs that the proxy can't forward
3670
3726
  store: false,
3671
3727
  forceReasoning: true,
3672
- reasoningEffort: "high",
3728
+ reasoningEffort,
3673
3729
  reasoningSummary: "detailed"
3674
3730
  }
3675
3731
  },
@@ -3682,10 +3738,17 @@ async function executeWithAiSdk(context) {
3682
3738
  }
3683
3739
  }
3684
3740
  };
3741
+ const { providerOptions: extraProviderOptions, ...topLevelExtras } = configExtras;
3742
+ const mergedProviderOptions = {
3743
+ ...extraProviderOptions && typeof extraProviderOptions === "object" ? extraProviderOptions : {},
3744
+ ...computedProviderOpts
3745
+ };
3685
3746
  const stepTimestamps = [];
3747
+ const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
3686
3748
  const { triggerPromptImages } = context;
3687
3749
  const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
3688
- const generateTextParams = {
3750
+ const result = await (0, import_ai.generateText)({
3751
+ ...topLevelExtras,
3689
3752
  model,
3690
3753
  system: systemPrompt,
3691
3754
  ...hasImages ? {
@@ -3703,11 +3766,16 @@ async function executeWithAiSdk(context) {
3703
3766
  }
3704
3767
  ]
3705
3768
  } : { prompt: scenario.triggerPrompt },
3706
- temperature: supportsThinking ? void 0 : modelConfig.temperature,
3707
- maxOutputTokens: modelConfig.maxTokens,
3769
+ temperature: supportsThinking ? void 0 : cfg.temperature,
3770
+ topP: supportsThinking ? void 0 : cfg.topP,
3771
+ frequencyPenalty: cfg.frequencyPenalty,
3772
+ presencePenalty: cfg.presencePenalty,
3773
+ seed: cfg.seed,
3774
+ stopSequences: cfg.stopSequences,
3775
+ maxOutputTokens: cfg.maxTokens,
3708
3776
  tools: mcpTools,
3709
- stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
3710
- providerOptions: providerOpts,
3777
+ ...mcpTools && effectiveMaxTurns != null ? { stopWhen: (0, import_ai.stepCountIs)(effectiveMaxTurns) } : !mcpTools ? { stopWhen: (0, import_ai.stepCountIs)(1) } : {},
3778
+ providerOptions: mergedProviderOptions,
3711
3779
  onStepFinish: (step) => {
3712
3780
  stepTimestamps.push(Date.now());
3713
3781
  if (traceContext) {
@@ -3735,8 +3803,7 @@ async function executeWithAiSdk(context) {
3735
3803
  );
3736
3804
  }
3737
3805
  }
3738
- };
3739
- const result = await (0, import_ai.generateText)(generateTextParams);
3806
+ });
3740
3807
  const durationMs = Date.now() - startTime;
3741
3808
  const usage = {
3742
3809
  inputTokens: result.usage.inputTokens ?? 0,
@@ -3747,7 +3814,7 @@ async function executeWithAiSdk(context) {
3747
3814
  result.steps,
3748
3815
  durationMs,
3749
3816
  usage,
3750
- modelConfig.model,
3817
+ modelId,
3751
3818
  provider,
3752
3819
  startTime,
3753
3820
  stepTimestamps
@@ -4660,11 +4727,12 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4660
4727
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
4661
4728
  const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
4662
4729
  const targetName = evalData.presetName || agent?.name || "";
4730
+ const agentConfig = agent?.config;
4663
4731
  const executionContext = {
4664
4732
  skills: evalData.skills,
4665
4733
  scenario,
4666
4734
  cwd: workDir || process.cwd(),
4667
- modelConfig: agent?.modelConfig,
4735
+ config: agentConfig,
4668
4736
  aiGatewayUrl: config.aiGatewayUrl,
4669
4737
  aiGatewayHeaders: config.aiGatewayHeaders,
4670
4738
  traceContext: {
@@ -4699,7 +4767,18 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4699
4767
  infrastructurePaths
4700
4768
  );
4701
4769
  const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
4702
- const resolvedModelConfig = agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
4770
+ const snapshotModelConfig = agentConfig?.model ? {
4771
+ model: agentConfig.model,
4772
+ ...agentConfig.temperature != null && {
4773
+ temperature: agentConfig.temperature
4774
+ },
4775
+ ...agentConfig.maxTokens != null && {
4776
+ maxTokens: agentConfig.maxTokens
4777
+ },
4778
+ ...agentConfig.maxTurns != null && {
4779
+ maxTurns: agentConfig.maxTurns
4780
+ }
4781
+ } : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
4703
4782
  return {
4704
4783
  id: (0, import_crypto4.randomUUID)(),
4705
4784
  targetId,
@@ -4707,7 +4786,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4707
4786
  scenarioId: scenario.id,
4708
4787
  scenarioName: scenario.name,
4709
4788
  triggerPrompt: scenario.triggerPrompt,
4710
- modelConfig: resolvedModelConfig,
4789
+ modelConfig: snapshotModelConfig,
4711
4790
  duration: durationMs,
4712
4791
  outputText,
4713
4792
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,