@wix/evalforge-evaluator 0.146.0 → 0.147.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +58 -7
- package/build/index.js.map +3 -3
- package/build/index.mjs +62 -8
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/simple-agent/execute.d.ts +1 -0
- package/build/types/run-scenario/agents/simple-agent/mcp-tools.d.ts +0 -2
- package/package.json +8 -7
package/build/index.mjs
CHANGED
|
@@ -2216,7 +2216,8 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = fetchGitHub
|
|
|
2216
2216
|
import { homedir as homedir2 } from "os";
|
|
2217
2217
|
import {
|
|
2218
2218
|
ClaudeModel as ClaudeModel2,
|
|
2219
|
-
AVAILABLE_OPENAI_MODEL_IDS
|
|
2219
|
+
AVAILABLE_OPENAI_MODEL_IDS,
|
|
2220
|
+
AVAILABLE_GEMINI_MODEL_IDS
|
|
2220
2221
|
} from "@wix/evalforge-types";
|
|
2221
2222
|
var DEFAULT_MODEL2 = `${ClaudeModel2.CLAUDE_4_5_SONNET_1_0}`;
|
|
2222
2223
|
function parseModel(model) {
|
|
@@ -2230,6 +2231,10 @@ function parseModel(model) {
|
|
|
2230
2231
|
const isOpenAI = AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
2231
2232
|
model
|
|
2232
2233
|
);
|
|
2234
|
+
const isGemini = AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
2235
|
+
model
|
|
2236
|
+
);
|
|
2237
|
+
if (isGemini) return { providerID: "google", modelID: model };
|
|
2233
2238
|
return { providerID: isOpenAI ? "openai" : "anthropic", modelID: model };
|
|
2234
2239
|
}
|
|
2235
2240
|
function toOpenCodeMcpConfig(servers) {
|
|
@@ -2281,8 +2286,9 @@ async function buildOpenCodeEnv(options) {
|
|
|
2281
2286
|
const { providerID, modelID } = parseModel(modelStr);
|
|
2282
2287
|
const provider = {};
|
|
2283
2288
|
if (options.aiGatewayUrl) {
|
|
2289
|
+
const proxyPath = providerID === "google" ? "gemini" : providerID;
|
|
2284
2290
|
const providerOptions = {
|
|
2285
|
-
baseURL: `${options.aiGatewayUrl}/proxy/${providerID}`,
|
|
2291
|
+
baseURL: `${options.aiGatewayUrl}/proxy/${proxyPath}`,
|
|
2286
2292
|
apiKey: "sk-placeholder-auth-handled-by-gateway"
|
|
2287
2293
|
};
|
|
2288
2294
|
if (options.aiGatewayHeaders) {
|
|
@@ -3346,9 +3352,12 @@ import {
|
|
|
3346
3352
|
stepCountIs
|
|
3347
3353
|
} from "ai";
|
|
3348
3354
|
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
3355
|
+
import { createGoogleGenerativeAI } from "@ai-sdk/google";
|
|
3349
3356
|
import { createOpenAI } from "@ai-sdk/openai";
|
|
3350
3357
|
import {
|
|
3351
3358
|
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
3359
|
+
AVAILABLE_GEMINI_MODEL_IDS as AVAILABLE_GEMINI_MODEL_IDS2,
|
|
3360
|
+
GEMINI_THINKING_MODEL_IDS,
|
|
3352
3361
|
OPENAI_RESPONSES_MODEL_IDS,
|
|
3353
3362
|
LLMStepType as LLMStepType3,
|
|
3354
3363
|
LiveTraceEventType as LiveTraceEventType3
|
|
@@ -3373,8 +3382,9 @@ async function buildMcpTools(mcps, cwd) {
|
|
|
3373
3382
|
const client = await createMCPClient({ transport });
|
|
3374
3383
|
clients.push(client);
|
|
3375
3384
|
const tools = await client.tools();
|
|
3385
|
+
const safePrefix = serverName.replace(/[^a-zA-Z0-9]/g, "_");
|
|
3376
3386
|
for (const [toolName, tool] of Object.entries(tools)) {
|
|
3377
|
-
allTools[`${
|
|
3387
|
+
allTools[`${safePrefix}_${toolName}`] = tool;
|
|
3378
3388
|
}
|
|
3379
3389
|
}
|
|
3380
3390
|
}
|
|
@@ -3450,6 +3460,7 @@ function extractErrorText(content) {
|
|
|
3450
3460
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
3451
3461
|
import { normalizeModelId } from "@wix/evalforge-types";
|
|
3452
3462
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
3463
|
+
var PROVIDER_GEMINI = "gemini";
|
|
3453
3464
|
var MODEL_PRICING = {
|
|
3454
3465
|
// Anthropic — Claude 4.6
|
|
3455
3466
|
"claude-sonnet-4-6": { input: 3, output: 15 },
|
|
@@ -3476,7 +3487,18 @@ var MODEL_PRICING = {
|
|
|
3476
3487
|
o3: { input: 2, output: 8 },
|
|
3477
3488
|
"o4-mini": { input: 1.1, output: 4.4 },
|
|
3478
3489
|
"o3-mini": { input: 1.1, output: 4.4 },
|
|
3479
|
-
o1: { input: 15, output: 60 }
|
|
3490
|
+
o1: { input: 15, output: 60 },
|
|
3491
|
+
// Google Gemini 2.0
|
|
3492
|
+
"gemini-2.0-flash": { input: 0.1, output: 0.4 },
|
|
3493
|
+
"gemini-2.0-flash-lite": { input: 0.075, output: 0.3 },
|
|
3494
|
+
// Google Gemini 2.5
|
|
3495
|
+
"gemini-2.5-pro": { input: 1.25, output: 10 },
|
|
3496
|
+
"gemini-2.5-flash": { input: 0.15, output: 0.6 },
|
|
3497
|
+
"gemini-2.5-flash-lite": { input: 0.075, output: 0.3 },
|
|
3498
|
+
// Google Gemini 3.x — standard pricing up to 200K context tokens
|
|
3499
|
+
"gemini-3-pro-preview": { input: 2, output: 12 },
|
|
3500
|
+
"gemini-3-flash-preview": { input: 0.5, output: 3 },
|
|
3501
|
+
"gemini-3.1-pro-preview": { input: 2, output: 12 }
|
|
3480
3502
|
};
|
|
3481
3503
|
function extractGatewayCost(step, provider) {
|
|
3482
3504
|
try {
|
|
@@ -3487,6 +3509,15 @@ function extractGatewayCost(step, provider) {
|
|
|
3487
3509
|
const cost2 = usage?.total_cost_usd;
|
|
3488
3510
|
return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
|
|
3489
3511
|
}
|
|
3512
|
+
if (provider === PROVIDER_GEMINI) {
|
|
3513
|
+
const meta = step.providerMetadata;
|
|
3514
|
+
const google = meta?.google;
|
|
3515
|
+
const cost2 = google?.total_cost_usd;
|
|
3516
|
+
if (typeof cost2 === "number" && cost2 > 0) return cost2;
|
|
3517
|
+
const body2 = step.response?.body;
|
|
3518
|
+
const bodyCost = body2?.total_cost_usd;
|
|
3519
|
+
return typeof bodyCost === "number" && bodyCost > 0 ? bodyCost : void 0;
|
|
3520
|
+
}
|
|
3490
3521
|
const body = step.response?.body;
|
|
3491
3522
|
const cost = body?.total_cost_usd;
|
|
3492
3523
|
return typeof cost === "number" && cost > 0 ? cost : void 0;
|
|
@@ -3564,10 +3595,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3564
3595
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3565
3596
|
var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
3566
3597
|
var PROVIDER_OPENAI = "openai";
|
|
3598
|
+
var PROVIDER_GEMINI2 = "gemini";
|
|
3567
3599
|
var DEFAULT_MAX_TOOL_STEPS = 25;
|
|
3568
3600
|
function createModel(modelId, baseUrl, headers) {
|
|
3569
|
-
|
|
3570
|
-
if (isClaudeModel) {
|
|
3601
|
+
if (isClaudeModelId(modelId)) {
|
|
3571
3602
|
const anthropic = createAnthropic({
|
|
3572
3603
|
baseURL: `${baseUrl}/proxy/anthropic`,
|
|
3573
3604
|
apiKey: "proxy-auth",
|
|
@@ -3575,6 +3606,14 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
3575
3606
|
});
|
|
3576
3607
|
return anthropic(modelId);
|
|
3577
3608
|
}
|
|
3609
|
+
if (isGeminiModelId(modelId)) {
|
|
3610
|
+
const google = createGoogleGenerativeAI({
|
|
3611
|
+
baseURL: `${baseUrl}/proxy/gemini`,
|
|
3612
|
+
apiKey: "proxy-auth",
|
|
3613
|
+
headers
|
|
3614
|
+
});
|
|
3615
|
+
return google(modelId);
|
|
3616
|
+
}
|
|
3578
3617
|
const openai = createOpenAI({
|
|
3579
3618
|
baseURL: `${baseUrl}/proxy/openai`,
|
|
3580
3619
|
apiKey: "proxy-auth",
|
|
@@ -3592,6 +3631,11 @@ function isClaudeModelId(modelId) {
|
|
|
3592
3631
|
(id) => modelId === id || modelId.startsWith(id)
|
|
3593
3632
|
);
|
|
3594
3633
|
}
|
|
3634
|
+
function isGeminiModelId(modelId) {
|
|
3635
|
+
return AVAILABLE_GEMINI_MODEL_IDS2.some(
|
|
3636
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3637
|
+
);
|
|
3638
|
+
}
|
|
3595
3639
|
function extractSkillContent(files) {
|
|
3596
3640
|
if (!files || files.length === 0) return void 0;
|
|
3597
3641
|
const skillMd = files.find((f) => f.path === "SKILL.md");
|
|
@@ -3615,7 +3659,7 @@ async function executeWithAiSdk(context) {
|
|
|
3615
3659
|
throw new Error("Simple Agent requires a model in modelConfig");
|
|
3616
3660
|
}
|
|
3617
3661
|
const model = createModel(modelConfig.model, aiGatewayUrl, aiGatewayHeaders);
|
|
3618
|
-
const provider = isClaudeModelId(modelConfig.model) ? PROVIDER_ANTHROPIC2 : PROVIDER_OPENAI;
|
|
3662
|
+
const provider = isClaudeModelId(modelConfig.model) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelConfig.model) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
|
|
3619
3663
|
const systemPrompt = composeSystemPrompt(context);
|
|
3620
3664
|
const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
|
|
3621
3665
|
const startTime = Date.now();
|
|
@@ -3627,7 +3671,9 @@ async function executeWithAiSdk(context) {
|
|
|
3627
3671
|
const isResponsesAPI = [...OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3628
3672
|
(id) => modelConfig.model === id || modelConfig.model.startsWith(id)
|
|
3629
3673
|
);
|
|
3630
|
-
const supportsThinking = isAnthropic || isResponsesAPI;
|
|
3674
|
+
const isGemini = provider === PROVIDER_GEMINI2;
|
|
3675
|
+
const isGeminiThinking = isGemini && GEMINI_THINKING_MODEL_IDS.has(modelConfig.model);
|
|
3676
|
+
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
3631
3677
|
const providerOpts = {
|
|
3632
3678
|
...isAnthropic && {
|
|
3633
3679
|
anthropic: {
|
|
@@ -3642,6 +3688,14 @@ async function executeWithAiSdk(context) {
|
|
|
3642
3688
|
reasoningEffort: "high",
|
|
3643
3689
|
reasoningSummary: "detailed"
|
|
3644
3690
|
}
|
|
3691
|
+
},
|
|
3692
|
+
...isGeminiThinking && {
|
|
3693
|
+
google: {
|
|
3694
|
+
thinkingConfig: {
|
|
3695
|
+
includeThoughts: true,
|
|
3696
|
+
thinkingBudget: 1e4
|
|
3697
|
+
}
|
|
3698
|
+
}
|
|
3645
3699
|
}
|
|
3646
3700
|
};
|
|
3647
3701
|
const stepTimestamps = [];
|