@wix/evalforge-evaluator 0.145.0 → 0.147.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -2225,6 +2225,10 @@ function parseModel(model) {
2225
2225
  const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
2226
2226
  model
2227
2227
  );
2228
+ const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
2229
+ model
2230
+ );
2231
+ if (isGemini) return { providerID: "google", modelID: model };
2228
2232
  return { providerID: isOpenAI ? "openai" : "anthropic", modelID: model };
2229
2233
  }
2230
2234
  function toOpenCodeMcpConfig(servers) {
@@ -2276,8 +2280,9 @@ async function buildOpenCodeEnv(options) {
2276
2280
  const { providerID, modelID } = parseModel(modelStr);
2277
2281
  const provider = {};
2278
2282
  if (options.aiGatewayUrl) {
2283
+ const proxyPath = providerID === "google" ? "gemini" : providerID;
2279
2284
  const providerOptions = {
2280
- baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
2285
+ baseURL: `${options.aiGatewayUrl}/proxy/${proxyPath}`,
2281
2286
  apiKey: "sk-placeholder-auth-handled-by-gateway"
2282
2287
  };
2283
2288
  if (options.aiGatewayHeaders) {
@@ -3338,6 +3343,7 @@ defaultRegistry.register(openCodeAdapter);
3338
3343
  // src/run-scenario/agents/simple-agent/execute.ts
3339
3344
  var import_ai = require("ai");
3340
3345
  var import_anthropic = require("@ai-sdk/anthropic");
3346
+ var import_google = require("@ai-sdk/google");
3341
3347
  var import_openai = require("@ai-sdk/openai");
3342
3348
  var import_evalforge_types11 = require("@wix/evalforge-types");
3343
3349
  var import_crypto3 = require("crypto");
@@ -3360,8 +3366,9 @@ async function buildMcpTools(mcps, cwd) {
3360
3366
  const client = await (0, import_mcp.createMCPClient)({ transport });
3361
3367
  clients.push(client);
3362
3368
  const tools = await client.tools();
3369
+ const safePrefix = serverName.replace(/[^a-zA-Z0-9]/g, "_");
3363
3370
  for (const [toolName, tool] of Object.entries(tools)) {
3364
- allTools[`${serverName}__${toolName}`] = tool;
3371
+ allTools[`${safePrefix}_${toolName}`] = tool;
3365
3372
  }
3366
3373
  }
3367
3374
  }
@@ -3437,6 +3444,7 @@ function extractErrorText(content) {
3437
3444
  // src/run-scenario/agents/simple-agent/cost-calculation.ts
3438
3445
  var import_evalforge_types10 = require("@wix/evalforge-types");
3439
3446
  var PROVIDER_ANTHROPIC = "anthropic";
3447
+ var PROVIDER_GEMINI = "gemini";
3440
3448
  var MODEL_PRICING = {
3441
3449
  // Anthropic — Claude 4.6
3442
3450
  "claude-sonnet-4-6": { input: 3, output: 15 },
@@ -3463,7 +3471,18 @@ var MODEL_PRICING = {
3463
3471
  o3: { input: 2, output: 8 },
3464
3472
  "o4-mini": { input: 1.1, output: 4.4 },
3465
3473
  "o3-mini": { input: 1.1, output: 4.4 },
3466
- o1: { input: 15, output: 60 }
3474
+ o1: { input: 15, output: 60 },
3475
+ // Google Gemini 2.0
3476
+ "gemini-2.0-flash": { input: 0.1, output: 0.4 },
3477
+ "gemini-2.0-flash-lite": { input: 0.075, output: 0.3 },
3478
+ // Google Gemini 2.5
3479
+ "gemini-2.5-pro": { input: 1.25, output: 10 },
3480
+ "gemini-2.5-flash": { input: 0.15, output: 0.6 },
3481
+ "gemini-2.5-flash-lite": { input: 0.075, output: 0.3 },
3482
+ // Google Gemini 3.x — standard pricing up to 200K context tokens
3483
+ "gemini-3-pro-preview": { input: 2, output: 12 },
3484
+ "gemini-3-flash-preview": { input: 0.5, output: 3 },
3485
+ "gemini-3.1-pro-preview": { input: 2, output: 12 }
3467
3486
  };
3468
3487
  function extractGatewayCost(step, provider) {
3469
3488
  try {
@@ -3474,6 +3493,15 @@ function extractGatewayCost(step, provider) {
3474
3493
  const cost2 = usage?.total_cost_usd;
3475
3494
  return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
3476
3495
  }
3496
+ if (provider === PROVIDER_GEMINI) {
3497
+ const meta = step.providerMetadata;
3498
+ const google = meta?.google;
3499
+ const cost2 = google?.total_cost_usd;
3500
+ if (typeof cost2 === "number" && cost2 > 0) return cost2;
3501
+ const body2 = step.response?.body;
3502
+ const bodyCost = body2?.total_cost_usd;
3503
+ return typeof bodyCost === "number" && bodyCost > 0 ? bodyCost : void 0;
3504
+ }
3477
3505
  const body = step.response?.body;
3478
3506
  const cost = body?.total_cost_usd;
3479
3507
  return typeof cost === "number" && cost > 0 ? cost : void 0;
@@ -3551,10 +3579,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
3551
3579
  // src/run-scenario/agents/simple-agent/execute.ts
3552
3580
  var PROVIDER_ANTHROPIC2 = "anthropic";
3553
3581
  var PROVIDER_OPENAI = "openai";
3582
+ var PROVIDER_GEMINI2 = "gemini";
3554
3583
  var DEFAULT_MAX_TOOL_STEPS = 25;
3555
3584
  function createModel(modelId, baseUrl, headers) {
3556
- const isClaudeModel = isClaudeModelId(modelId);
3557
- if (isClaudeModel) {
3585
+ if (isClaudeModelId(modelId)) {
3558
3586
  const anthropic = (0, import_anthropic.createAnthropic)({
3559
3587
  baseURL: `${baseUrl}/proxy/anthropic`,
3560
3588
  apiKey: "proxy-auth",
@@ -3562,6 +3590,14 @@ function createModel(modelId, baseUrl, headers) {
3562
3590
  });
3563
3591
  return anthropic(modelId);
3564
3592
  }
3593
+ if (isGeminiModelId(modelId)) {
3594
+ const google = (0, import_google.createGoogleGenerativeAI)({
3595
+ baseURL: `${baseUrl}/proxy/gemini`,
3596
+ apiKey: "proxy-auth",
3597
+ headers
3598
+ });
3599
+ return google(modelId);
3600
+ }
3565
3601
  const openai = (0, import_openai.createOpenAI)({
3566
3602
  baseURL: `${baseUrl}/proxy/openai`,
3567
3603
  apiKey: "proxy-auth",
@@ -3579,6 +3615,11 @@ function isClaudeModelId(modelId) {
3579
3615
  (id) => modelId === id || modelId.startsWith(id)
3580
3616
  );
3581
3617
  }
3618
+ function isGeminiModelId(modelId) {
3619
+ return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
3620
+ (id) => modelId === id || modelId.startsWith(id)
3621
+ );
3622
+ }
3582
3623
  function extractSkillContent(files) {
3583
3624
  if (!files || files.length === 0) return void 0;
3584
3625
  const skillMd = files.find((f) => f.path === "SKILL.md");
@@ -3602,7 +3643,7 @@ async function executeWithAiSdk(context) {
3602
3643
  throw new Error("Simple Agent requires a model in modelConfig");
3603
3644
  }
3604
3645
  const model = createModel(modelConfig.model, aiGatewayUrl, aiGatewayHeaders);
3605
- const provider = isClaudeModelId(modelConfig.model) ? PROVIDER_ANTHROPIC2 : PROVIDER_OPENAI;
3646
+ const provider = isClaudeModelId(modelConfig.model) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelConfig.model) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
3606
3647
  const systemPrompt = composeSystemPrompt(context);
3607
3648
  const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
3608
3649
  const startTime = Date.now();
@@ -3614,7 +3655,9 @@ async function executeWithAiSdk(context) {
3614
3655
  const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
3615
3656
  (id) => modelConfig.model === id || modelConfig.model.startsWith(id)
3616
3657
  );
3617
- const supportsThinking = isAnthropic || isResponsesAPI;
3658
+ const isGemini = provider === PROVIDER_GEMINI2;
3659
+ const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelConfig.model);
3660
+ const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
3618
3661
  const providerOpts = {
3619
3662
  ...isAnthropic && {
3620
3663
  anthropic: {
@@ -3629,6 +3672,14 @@ async function executeWithAiSdk(context) {
3629
3672
  reasoningEffort: "high",
3630
3673
  reasoningSummary: "detailed"
3631
3674
  }
3675
+ },
3676
+ ...isGeminiThinking && {
3677
+ google: {
3678
+ thinkingConfig: {
3679
+ includeThoughts: true,
3680
+ thinkingBudget: 1e4
3681
+ }
3682
+ }
3632
3683
  }
3633
3684
  };
3634
3685
  const stepTimestamps = [];