@wix/evalforge-evaluator 0.146.0 → 0.148.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +192 -62
- package/build/index.js.map +3 -3
- package/build/index.mjs +201 -66
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/types.d.ts +15 -3
- package/build/types/run-scenario/agents/opencode/config.d.ts +3 -0
- package/build/types/run-scenario/agents/opencode/types.d.ts +8 -0
- package/build/types/run-scenario/agents/simple-agent/execute.d.ts +1 -0
- package/build/types/run-scenario/agents/simple-agent/mcp-tools.d.ts +0 -2
- package/package.json +8 -7
package/build/index.js
CHANGED
|
@@ -330,10 +330,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
330
330
|
if (evalRun.agentId) {
|
|
331
331
|
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
332
332
|
}
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
333
|
+
const skills = [];
|
|
334
|
+
const mcps = [];
|
|
335
|
+
const subAgents = [];
|
|
336
|
+
const rules = [];
|
|
337
337
|
if (evalRun.capabilityIds && evalRun.capabilityIds.length > 0) {
|
|
338
338
|
const fetchResults = await Promise.allSettled(
|
|
339
339
|
evalRun.capabilityIds.map((id) => api.getCapability(projectId2, id))
|
|
@@ -1190,10 +1190,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1190
1190
|
let lastAction = "Starting...";
|
|
1191
1191
|
let lastToolName;
|
|
1192
1192
|
let lastFilePath;
|
|
1193
|
-
const maxTurns = options.maxTurns
|
|
1193
|
+
const maxTurns = options.maxTurns || void 0;
|
|
1194
1194
|
let messageCount = 0;
|
|
1195
1195
|
const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
|
|
1196
|
-
const baseAllowedTools = [
|
|
1196
|
+
const baseAllowedTools = options.allowedTools ?? [
|
|
1197
1197
|
"Skill",
|
|
1198
1198
|
"Read",
|
|
1199
1199
|
"Write",
|
|
@@ -1207,13 +1207,17 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1207
1207
|
];
|
|
1208
1208
|
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
1209
1209
|
const queryOptions = {
|
|
1210
|
+
...options.extras ?? {},
|
|
1210
1211
|
env: sdkEnv,
|
|
1211
1212
|
cwd: options.cwd,
|
|
1212
1213
|
settingSources: ["project"],
|
|
1213
1214
|
allowedTools,
|
|
1215
|
+
...options.disallowedTools?.length ? { disallowedTools: options.disallowedTools } : {},
|
|
1214
1216
|
model: options.model || DEFAULT_MODEL,
|
|
1215
1217
|
maxTurns,
|
|
1216
1218
|
maxThinkingTokens: options.maxThinkingTokens,
|
|
1219
|
+
...options.effort ? { effort: options.effort } : {},
|
|
1220
|
+
...options.maxBudgetUsd != null ? { maxBudgetUsd: options.maxBudgetUsd } : {},
|
|
1217
1221
|
// Use 'default' permission mode with custom canUseTool handler
|
|
1218
1222
|
// instead of 'bypassPermissions' which fails on root
|
|
1219
1223
|
permissionMode: "default",
|
|
@@ -1288,7 +1292,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1288
1292
|
traceContext.authToken
|
|
1289
1293
|
);
|
|
1290
1294
|
}
|
|
1291
|
-
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
1295
|
+
const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
|
|
1292
1296
|
let timeoutHandle;
|
|
1293
1297
|
let timedOut = false;
|
|
1294
1298
|
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
@@ -2052,7 +2056,7 @@ var ClaudeCodeAdapter = class {
|
|
|
2052
2056
|
skills,
|
|
2053
2057
|
scenario,
|
|
2054
2058
|
cwd,
|
|
2055
|
-
|
|
2059
|
+
config,
|
|
2056
2060
|
aiGatewayUrl,
|
|
2057
2061
|
aiGatewayHeaders,
|
|
2058
2062
|
traceContext,
|
|
@@ -2061,20 +2065,37 @@ var ClaudeCodeAdapter = class {
|
|
|
2061
2065
|
rules,
|
|
2062
2066
|
systemPrompt
|
|
2063
2067
|
} = context;
|
|
2064
|
-
const
|
|
2068
|
+
const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
2069
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
2070
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
|
|
2071
|
+
const extras = {};
|
|
2072
|
+
if (config) {
|
|
2073
|
+
for (const [key, value] of Object.entries(config)) {
|
|
2074
|
+
if (!schemaKeys.has(key)) extras[key] = value;
|
|
2075
|
+
}
|
|
2076
|
+
}
|
|
2077
|
+
const rawMaxTurns = cfg?.maxTurns;
|
|
2078
|
+
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
2065
2079
|
const options = {
|
|
2066
2080
|
cwd,
|
|
2067
|
-
model:
|
|
2068
|
-
temperature:
|
|
2069
|
-
maxTokens:
|
|
2070
|
-
maxTurns
|
|
2081
|
+
model: cfg?.model,
|
|
2082
|
+
temperature: cfg?.temperature,
|
|
2083
|
+
maxTokens: cfg?.maxTokens,
|
|
2084
|
+
maxTurns,
|
|
2085
|
+
maxThinkingTokens: cfg?.maxThinkingTokens,
|
|
2086
|
+
allowedTools: cfg?.allowedTools,
|
|
2087
|
+
disallowedTools: cfg?.disallowedTools,
|
|
2088
|
+
effort: cfg?.effort,
|
|
2089
|
+
maxBudgetUsd: cfg?.maxBudgetUsd,
|
|
2090
|
+
maxDurationMs: cfg?.maxDurationMs,
|
|
2071
2091
|
aiGatewayUrl,
|
|
2072
2092
|
aiGatewayHeaders,
|
|
2073
2093
|
traceContext,
|
|
2074
2094
|
mcps,
|
|
2075
2095
|
subAgents,
|
|
2076
2096
|
rules,
|
|
2077
|
-
systemPrompt
|
|
2097
|
+
systemPrompt,
|
|
2098
|
+
extras
|
|
2078
2099
|
};
|
|
2079
2100
|
const { result, llmTrace, conversation } = await executeWithClaudeCode(
|
|
2080
2101
|
skills,
|
|
@@ -2214,18 +2235,29 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
|
|
|
2214
2235
|
var import_os3 = require("os");
|
|
2215
2236
|
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
2216
2237
|
var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
2238
|
+
var OPENCODE_MODEL_ALIASES = {
|
|
2239
|
+
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
2240
|
+
"claude-opus-4": "claude-opus-4-0"
|
|
2241
|
+
};
|
|
2217
2242
|
function parseModel(model) {
|
|
2218
2243
|
const slashIndex = model.indexOf("/");
|
|
2219
2244
|
if (slashIndex > 0) {
|
|
2245
|
+
const providerID = model.slice(0, slashIndex);
|
|
2246
|
+
const rawModelID = model.slice(slashIndex + 1);
|
|
2220
2247
|
return {
|
|
2221
|
-
providerID
|
|
2222
|
-
modelID:
|
|
2248
|
+
providerID,
|
|
2249
|
+
modelID: OPENCODE_MODEL_ALIASES[rawModelID] ?? rawModelID
|
|
2223
2250
|
};
|
|
2224
2251
|
}
|
|
2252
|
+
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
2225
2253
|
const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
2226
2254
|
model
|
|
2227
2255
|
);
|
|
2228
|
-
|
|
2256
|
+
const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
2257
|
+
model
|
|
2258
|
+
);
|
|
2259
|
+
if (isGemini) return { providerID: "google", modelID };
|
|
2260
|
+
return { providerID: isOpenAI ? "openai" : "anthropic", modelID };
|
|
2229
2261
|
}
|
|
2230
2262
|
function toOpenCodeMcpConfig(servers) {
|
|
2231
2263
|
const result = {};
|
|
@@ -2276,8 +2308,9 @@ async function buildOpenCodeEnv(options) {
|
|
|
2276
2308
|
const { providerID, modelID } = parseModel(modelStr);
|
|
2277
2309
|
const provider = {};
|
|
2278
2310
|
if (options.aiGatewayUrl) {
|
|
2311
|
+
const proxyPath = providerID === "google" ? "gemini" : providerID;
|
|
2279
2312
|
const providerOptions = {
|
|
2280
|
-
baseURL: `${options.aiGatewayUrl}/proxy/${
|
|
2313
|
+
baseURL: `${options.aiGatewayUrl}/proxy/${proxyPath}`,
|
|
2281
2314
|
apiKey: "sk-placeholder-auth-handled-by-gateway"
|
|
2282
2315
|
};
|
|
2283
2316
|
if (options.aiGatewayHeaders) {
|
|
@@ -2308,20 +2341,26 @@ async function buildOpenCodeEnv(options) {
|
|
|
2308
2341
|
if (options.temperature != null) {
|
|
2309
2342
|
agentOverrides.temperature = options.temperature;
|
|
2310
2343
|
}
|
|
2311
|
-
if (options.
|
|
2344
|
+
if (options.topP != null) {
|
|
2345
|
+
agentOverrides.top_p = options.topP;
|
|
2346
|
+
}
|
|
2347
|
+
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
2312
2348
|
agentOverrides.maxSteps = options.maxTurns;
|
|
2313
2349
|
}
|
|
2350
|
+
const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
2351
|
+
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
2352
|
+
const defaultPermission = {
|
|
2353
|
+
"*": "allow"
|
|
2354
|
+
};
|
|
2355
|
+
const permission = {
|
|
2356
|
+
...defaultPermission,
|
|
2357
|
+
...configPermission
|
|
2358
|
+
};
|
|
2314
2359
|
const config = {
|
|
2315
2360
|
model: `${providerID}/${modelID}`,
|
|
2316
2361
|
provider,
|
|
2317
2362
|
...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
|
|
2318
|
-
permission
|
|
2319
|
-
edit: "allow",
|
|
2320
|
-
bash: "allow",
|
|
2321
|
-
webfetch: "allow",
|
|
2322
|
-
doom_loop: "allow",
|
|
2323
|
-
external_directory: "allow"
|
|
2324
|
-
},
|
|
2363
|
+
permission,
|
|
2325
2364
|
...mcp ? { mcp } : {}
|
|
2326
2365
|
};
|
|
2327
2366
|
const env = {
|
|
@@ -3055,16 +3094,18 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3055
3094
|
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
3056
3095
|
model: options.model
|
|
3057
3096
|
});
|
|
3058
|
-
const maxTurns = options.maxTurns
|
|
3059
|
-
const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
|
|
3097
|
+
const maxTurns = options.maxTurns || void 0;
|
|
3098
|
+
const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
|
|
3060
3099
|
const { env, providerID, modelID } = await buildOpenCodeEnv({
|
|
3061
3100
|
model: options.model,
|
|
3062
3101
|
temperature: options.temperature,
|
|
3102
|
+
topP: options.topP,
|
|
3063
3103
|
maxTurns,
|
|
3064
3104
|
aiGatewayUrl: options.aiGatewayUrl,
|
|
3065
3105
|
aiGatewayHeaders: options.aiGatewayHeaders,
|
|
3066
3106
|
mcps: options.mcps,
|
|
3067
|
-
cwd: options.cwd
|
|
3107
|
+
cwd: options.cwd,
|
|
3108
|
+
config: options.config
|
|
3068
3109
|
});
|
|
3069
3110
|
const startTime = /* @__PURE__ */ new Date();
|
|
3070
3111
|
const traceContext = options.traceContext;
|
|
@@ -3092,13 +3133,13 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3092
3133
|
traceContext.authToken
|
|
3093
3134
|
);
|
|
3094
3135
|
}
|
|
3136
|
+
const variant = options.thinkingVariant ?? "high";
|
|
3137
|
+
const thinkingArgs = variant === "none" ? [] : ["--thinking", "--variant", variant];
|
|
3095
3138
|
const baseArgs = [
|
|
3096
3139
|
"run",
|
|
3097
3140
|
"--format",
|
|
3098
3141
|
"json",
|
|
3099
|
-
|
|
3100
|
-
"--variant",
|
|
3101
|
-
"high",
|
|
3142
|
+
...thinkingArgs,
|
|
3102
3143
|
"--model",
|
|
3103
3144
|
`${providerID}/${modelID}`,
|
|
3104
3145
|
"--dir",
|
|
@@ -3289,7 +3330,7 @@ var OpenCodeAdapter = class {
|
|
|
3289
3330
|
skills,
|
|
3290
3331
|
scenario,
|
|
3291
3332
|
cwd,
|
|
3292
|
-
|
|
3333
|
+
config,
|
|
3293
3334
|
aiGatewayUrl,
|
|
3294
3335
|
aiGatewayHeaders,
|
|
3295
3336
|
traceContext,
|
|
@@ -3298,18 +3339,26 @@ var OpenCodeAdapter = class {
|
|
|
3298
3339
|
rules,
|
|
3299
3340
|
systemPrompt
|
|
3300
3341
|
} = context;
|
|
3342
|
+
const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
3343
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
3344
|
+
const rawMaxTurns = cfg?.maxTurns;
|
|
3345
|
+
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
3301
3346
|
const options = {
|
|
3302
3347
|
cwd,
|
|
3303
|
-
model:
|
|
3304
|
-
temperature:
|
|
3305
|
-
|
|
3348
|
+
model: cfg?.model,
|
|
3349
|
+
temperature: cfg?.temperature,
|
|
3350
|
+
topP: cfg?.topP,
|
|
3351
|
+
maxTurns,
|
|
3352
|
+
thinkingVariant: cfg?.thinkingVariant,
|
|
3353
|
+
maxDurationMs: cfg?.maxDurationMs,
|
|
3306
3354
|
aiGatewayUrl,
|
|
3307
3355
|
aiGatewayHeaders,
|
|
3308
3356
|
traceContext,
|
|
3309
3357
|
mcps,
|
|
3310
3358
|
subAgents,
|
|
3311
3359
|
rules,
|
|
3312
|
-
systemPrompt
|
|
3360
|
+
systemPrompt,
|
|
3361
|
+
config
|
|
3313
3362
|
};
|
|
3314
3363
|
const { result, llmTrace, conversation } = await executeWithOpenCode(
|
|
3315
3364
|
skills,
|
|
@@ -3338,6 +3387,7 @@ defaultRegistry.register(openCodeAdapter);
|
|
|
3338
3387
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3339
3388
|
var import_ai = require("ai");
|
|
3340
3389
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
3390
|
+
var import_google = require("@ai-sdk/google");
|
|
3341
3391
|
var import_openai = require("@ai-sdk/openai");
|
|
3342
3392
|
var import_evalforge_types11 = require("@wix/evalforge-types");
|
|
3343
3393
|
var import_crypto3 = require("crypto");
|
|
@@ -3360,8 +3410,9 @@ async function buildMcpTools(mcps, cwd) {
|
|
|
3360
3410
|
const client = await (0, import_mcp.createMCPClient)({ transport });
|
|
3361
3411
|
clients.push(client);
|
|
3362
3412
|
const tools = await client.tools();
|
|
3413
|
+
const safePrefix = serverName.replace(/[^a-zA-Z0-9]/g, "_");
|
|
3363
3414
|
for (const [toolName, tool] of Object.entries(tools)) {
|
|
3364
|
-
allTools[`${
|
|
3415
|
+
allTools[`${safePrefix}_${toolName}`] = tool;
|
|
3365
3416
|
}
|
|
3366
3417
|
}
|
|
3367
3418
|
}
|
|
@@ -3437,6 +3488,7 @@ function extractErrorText(content) {
|
|
|
3437
3488
|
// src/run-scenario/agents/simple-agent/cost-calculation.ts
|
|
3438
3489
|
var import_evalforge_types10 = require("@wix/evalforge-types");
|
|
3439
3490
|
var PROVIDER_ANTHROPIC = "anthropic";
|
|
3491
|
+
var PROVIDER_GEMINI = "gemini";
|
|
3440
3492
|
var MODEL_PRICING = {
|
|
3441
3493
|
// Anthropic — Claude 4.6
|
|
3442
3494
|
"claude-sonnet-4-6": { input: 3, output: 15 },
|
|
@@ -3463,7 +3515,18 @@ var MODEL_PRICING = {
|
|
|
3463
3515
|
o3: { input: 2, output: 8 },
|
|
3464
3516
|
"o4-mini": { input: 1.1, output: 4.4 },
|
|
3465
3517
|
"o3-mini": { input: 1.1, output: 4.4 },
|
|
3466
|
-
o1: { input: 15, output: 60 }
|
|
3518
|
+
o1: { input: 15, output: 60 },
|
|
3519
|
+
// Google Gemini 2.0
|
|
3520
|
+
"gemini-2.0-flash": { input: 0.1, output: 0.4 },
|
|
3521
|
+
"gemini-2.0-flash-lite": { input: 0.075, output: 0.3 },
|
|
3522
|
+
// Google Gemini 2.5
|
|
3523
|
+
"gemini-2.5-pro": { input: 1.25, output: 10 },
|
|
3524
|
+
"gemini-2.5-flash": { input: 0.15, output: 0.6 },
|
|
3525
|
+
"gemini-2.5-flash-lite": { input: 0.075, output: 0.3 },
|
|
3526
|
+
// Google Gemini 3.x — standard pricing up to 200K context tokens
|
|
3527
|
+
"gemini-3-pro-preview": { input: 2, output: 12 },
|
|
3528
|
+
"gemini-3-flash-preview": { input: 0.5, output: 3 },
|
|
3529
|
+
"gemini-3.1-pro-preview": { input: 2, output: 12 }
|
|
3467
3530
|
};
|
|
3468
3531
|
function extractGatewayCost(step, provider) {
|
|
3469
3532
|
try {
|
|
@@ -3474,6 +3537,15 @@ function extractGatewayCost(step, provider) {
|
|
|
3474
3537
|
const cost2 = usage?.total_cost_usd;
|
|
3475
3538
|
return typeof cost2 === "number" && cost2 > 0 ? cost2 : void 0;
|
|
3476
3539
|
}
|
|
3540
|
+
if (provider === PROVIDER_GEMINI) {
|
|
3541
|
+
const meta = step.providerMetadata;
|
|
3542
|
+
const google = meta?.google;
|
|
3543
|
+
const cost2 = google?.total_cost_usd;
|
|
3544
|
+
if (typeof cost2 === "number" && cost2 > 0) return cost2;
|
|
3545
|
+
const body2 = step.response?.body;
|
|
3546
|
+
const bodyCost = body2?.total_cost_usd;
|
|
3547
|
+
return typeof bodyCost === "number" && bodyCost > 0 ? bodyCost : void 0;
|
|
3548
|
+
}
|
|
3477
3549
|
const body = step.response?.body;
|
|
3478
3550
|
const cost = body?.total_cost_usd;
|
|
3479
3551
|
return typeof cost === "number" && cost > 0 ? cost : void 0;
|
|
@@ -3551,10 +3623,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
|
|
|
3551
3623
|
// src/run-scenario/agents/simple-agent/execute.ts
|
|
3552
3624
|
var PROVIDER_ANTHROPIC2 = "anthropic";
|
|
3553
3625
|
var PROVIDER_OPENAI = "openai";
|
|
3626
|
+
var PROVIDER_GEMINI2 = "gemini";
|
|
3554
3627
|
var DEFAULT_MAX_TOOL_STEPS = 25;
|
|
3555
3628
|
function createModel(modelId, baseUrl, headers) {
|
|
3556
|
-
|
|
3557
|
-
if (isClaudeModel) {
|
|
3629
|
+
if (isClaudeModelId(modelId)) {
|
|
3558
3630
|
const anthropic = (0, import_anthropic.createAnthropic)({
|
|
3559
3631
|
baseURL: `${baseUrl}/proxy/anthropic`,
|
|
3560
3632
|
apiKey: "proxy-auth",
|
|
@@ -3562,6 +3634,14 @@ function createModel(modelId, baseUrl, headers) {
|
|
|
3562
3634
|
});
|
|
3563
3635
|
return anthropic(modelId);
|
|
3564
3636
|
}
|
|
3637
|
+
if (isGeminiModelId(modelId)) {
|
|
3638
|
+
const google = (0, import_google.createGoogleGenerativeAI)({
|
|
3639
|
+
baseURL: `${baseUrl}/proxy/gemini`,
|
|
3640
|
+
apiKey: "proxy-auth",
|
|
3641
|
+
headers
|
|
3642
|
+
});
|
|
3643
|
+
return google(modelId);
|
|
3644
|
+
}
|
|
3565
3645
|
const openai = (0, import_openai.createOpenAI)({
|
|
3566
3646
|
baseURL: `${baseUrl}/proxy/openai`,
|
|
3567
3647
|
apiKey: "proxy-auth",
|
|
@@ -3579,6 +3659,11 @@ function isClaudeModelId(modelId) {
|
|
|
3579
3659
|
(id) => modelId === id || modelId.startsWith(id)
|
|
3580
3660
|
);
|
|
3581
3661
|
}
|
|
3662
|
+
function isGeminiModelId(modelId) {
|
|
3663
|
+
return import_evalforge_types11.AVAILABLE_GEMINI_MODEL_IDS.some(
|
|
3664
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3665
|
+
);
|
|
3666
|
+
}
|
|
3582
3667
|
function extractSkillContent(files) {
|
|
3583
3668
|
if (!files || files.length === 0) return void 0;
|
|
3584
3669
|
const skillMd = files.find((f) => f.path === "SKILL.md");
|
|
@@ -3589,20 +3674,30 @@ async function executeWithAiSdk(context) {
|
|
|
3589
3674
|
const {
|
|
3590
3675
|
scenario,
|
|
3591
3676
|
cwd,
|
|
3592
|
-
|
|
3677
|
+
config,
|
|
3593
3678
|
aiGatewayUrl,
|
|
3594
3679
|
aiGatewayHeaders = {},
|
|
3595
3680
|
mcps,
|
|
3596
3681
|
traceContext
|
|
3597
3682
|
} = context;
|
|
3683
|
+
const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
3684
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
3685
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
|
|
3686
|
+
const configExtras = {};
|
|
3687
|
+
if (config) {
|
|
3688
|
+
for (const [key, value] of Object.entries(config)) {
|
|
3689
|
+
if (!schemaKeys.has(key)) configExtras[key] = value;
|
|
3690
|
+
}
|
|
3691
|
+
}
|
|
3598
3692
|
if (!aiGatewayUrl) {
|
|
3599
3693
|
throw new Error("Simple Agent requires aiGatewayUrl");
|
|
3600
3694
|
}
|
|
3601
|
-
if (!
|
|
3602
|
-
throw new Error("Simple Agent requires a model in
|
|
3695
|
+
if (!cfg?.model) {
|
|
3696
|
+
throw new Error("Simple Agent requires a model in config");
|
|
3603
3697
|
}
|
|
3604
|
-
const
|
|
3605
|
-
const
|
|
3698
|
+
const modelId = cfg.model;
|
|
3699
|
+
const model = createModel(modelId, aiGatewayUrl, aiGatewayHeaders);
|
|
3700
|
+
const provider = isClaudeModelId(modelId) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelId) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
|
|
3606
3701
|
const systemPrompt = composeSystemPrompt(context);
|
|
3607
3702
|
const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
|
|
3608
3703
|
const startTime = Date.now();
|
|
@@ -3612,13 +3707,17 @@ async function executeWithAiSdk(context) {
|
|
|
3612
3707
|
try {
|
|
3613
3708
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
3614
3709
|
const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3615
|
-
(id) =>
|
|
3710
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3616
3711
|
);
|
|
3617
|
-
const
|
|
3618
|
-
const
|
|
3712
|
+
const isGemini = provider === PROVIDER_GEMINI2;
|
|
3713
|
+
const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
3714
|
+
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
3715
|
+
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
3716
|
+
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
3717
|
+
const computedProviderOpts = {
|
|
3619
3718
|
...isAnthropic && {
|
|
3620
3719
|
anthropic: {
|
|
3621
|
-
thinking: { type: "enabled", budgetTokens:
|
|
3720
|
+
thinking: { type: "enabled", budgetTokens: thinkingBudgetTokens }
|
|
3622
3721
|
}
|
|
3623
3722
|
},
|
|
3624
3723
|
...isResponsesAPI && {
|
|
@@ -3626,15 +3725,30 @@ async function executeWithAiSdk(context) {
|
|
|
3626
3725
|
// Prevent the SDK from sending item_reference inputs that the proxy can't forward
|
|
3627
3726
|
store: false,
|
|
3628
3727
|
forceReasoning: true,
|
|
3629
|
-
reasoningEffort
|
|
3728
|
+
reasoningEffort,
|
|
3630
3729
|
reasoningSummary: "detailed"
|
|
3631
3730
|
}
|
|
3731
|
+
},
|
|
3732
|
+
...isGeminiThinking && {
|
|
3733
|
+
google: {
|
|
3734
|
+
thinkingConfig: {
|
|
3735
|
+
includeThoughts: true,
|
|
3736
|
+
thinkingBudget: 1e4
|
|
3737
|
+
}
|
|
3738
|
+
}
|
|
3632
3739
|
}
|
|
3633
3740
|
};
|
|
3741
|
+
const { providerOptions: extraProviderOptions, ...topLevelExtras } = configExtras;
|
|
3742
|
+
const mergedProviderOptions = {
|
|
3743
|
+
...extraProviderOptions && typeof extraProviderOptions === "object" ? extraProviderOptions : {},
|
|
3744
|
+
...computedProviderOpts
|
|
3745
|
+
};
|
|
3634
3746
|
const stepTimestamps = [];
|
|
3747
|
+
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3635
3748
|
const { triggerPromptImages } = context;
|
|
3636
3749
|
const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
|
|
3637
|
-
const
|
|
3750
|
+
const result = await (0, import_ai.generateText)({
|
|
3751
|
+
...topLevelExtras,
|
|
3638
3752
|
model,
|
|
3639
3753
|
system: systemPrompt,
|
|
3640
3754
|
...hasImages ? {
|
|
@@ -3652,11 +3766,16 @@ async function executeWithAiSdk(context) {
|
|
|
3652
3766
|
}
|
|
3653
3767
|
]
|
|
3654
3768
|
} : { prompt: scenario.triggerPrompt },
|
|
3655
|
-
temperature: supportsThinking ? void 0 :
|
|
3656
|
-
|
|
3769
|
+
temperature: supportsThinking ? void 0 : cfg.temperature,
|
|
3770
|
+
topP: supportsThinking ? void 0 : cfg.topP,
|
|
3771
|
+
frequencyPenalty: cfg.frequencyPenalty,
|
|
3772
|
+
presencePenalty: cfg.presencePenalty,
|
|
3773
|
+
seed: cfg.seed,
|
|
3774
|
+
stopSequences: cfg.stopSequences,
|
|
3775
|
+
maxOutputTokens: cfg.maxTokens,
|
|
3657
3776
|
tools: mcpTools,
|
|
3658
|
-
|
|
3659
|
-
providerOptions:
|
|
3777
|
+
...mcpTools && effectiveMaxTurns != null ? { stopWhen: (0, import_ai.stepCountIs)(effectiveMaxTurns) } : !mcpTools ? { stopWhen: (0, import_ai.stepCountIs)(1) } : {},
|
|
3778
|
+
providerOptions: mergedProviderOptions,
|
|
3660
3779
|
onStepFinish: (step) => {
|
|
3661
3780
|
stepTimestamps.push(Date.now());
|
|
3662
3781
|
if (traceContext) {
|
|
@@ -3684,8 +3803,7 @@ async function executeWithAiSdk(context) {
|
|
|
3684
3803
|
);
|
|
3685
3804
|
}
|
|
3686
3805
|
}
|
|
3687
|
-
};
|
|
3688
|
-
const result = await (0, import_ai.generateText)(generateTextParams);
|
|
3806
|
+
});
|
|
3689
3807
|
const durationMs = Date.now() - startTime;
|
|
3690
3808
|
const usage = {
|
|
3691
3809
|
inputTokens: result.usage.inputTokens ?? 0,
|
|
@@ -3696,7 +3814,7 @@ async function executeWithAiSdk(context) {
|
|
|
3696
3814
|
result.steps,
|
|
3697
3815
|
durationMs,
|
|
3698
3816
|
usage,
|
|
3699
|
-
|
|
3817
|
+
modelId,
|
|
3700
3818
|
provider,
|
|
3701
3819
|
startTime,
|
|
3702
3820
|
stepTimestamps
|
|
@@ -4609,11 +4727,12 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4609
4727
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4610
4728
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4611
4729
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4730
|
+
const agentConfig = agent?.config;
|
|
4612
4731
|
const executionContext = {
|
|
4613
4732
|
skills: evalData.skills,
|
|
4614
4733
|
scenario,
|
|
4615
4734
|
cwd: workDir || process.cwd(),
|
|
4616
|
-
|
|
4735
|
+
config: agentConfig,
|
|
4617
4736
|
aiGatewayUrl: config.aiGatewayUrl,
|
|
4618
4737
|
aiGatewayHeaders: config.aiGatewayHeaders,
|
|
4619
4738
|
traceContext: {
|
|
@@ -4648,7 +4767,18 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4648
4767
|
infrastructurePaths
|
|
4649
4768
|
);
|
|
4650
4769
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4651
|
-
const
|
|
4770
|
+
const snapshotModelConfig = agentConfig?.model ? {
|
|
4771
|
+
model: agentConfig.model,
|
|
4772
|
+
...agentConfig.temperature != null && {
|
|
4773
|
+
temperature: agentConfig.temperature
|
|
4774
|
+
},
|
|
4775
|
+
...agentConfig.maxTokens != null && {
|
|
4776
|
+
maxTokens: agentConfig.maxTokens
|
|
4777
|
+
},
|
|
4778
|
+
...agentConfig.maxTurns != null && {
|
|
4779
|
+
maxTurns: agentConfig.maxTurns
|
|
4780
|
+
}
|
|
4781
|
+
} : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
|
|
4652
4782
|
return {
|
|
4653
4783
|
id: (0, import_crypto4.randomUUID)(),
|
|
4654
4784
|
targetId,
|
|
@@ -4656,7 +4786,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4656
4786
|
scenarioId: scenario.id,
|
|
4657
4787
|
scenarioName: scenario.name,
|
|
4658
4788
|
triggerPrompt: scenario.triggerPrompt,
|
|
4659
|
-
modelConfig:
|
|
4789
|
+
modelConfig: snapshotModelConfig,
|
|
4660
4790
|
duration: durationMs,
|
|
4661
4791
|
outputText,
|
|
4662
4792
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|