@wix/evalforge-evaluator 0.147.0 → 0.148.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/build/index.js +137 -58
- package/build/index.js.map +2 -2
- package/build/index.mjs +143 -62
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/types.d.ts +15 -3
- package/build/types/run-scenario/agents/opencode/config.d.ts +3 -0
- package/build/types/run-scenario/agents/opencode/types.d.ts +8 -0
- package/package.json +5 -5
package/build/index.mjs
CHANGED
|
@@ -315,10 +315,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
315
315
|
if (evalRun.agentId) {
|
|
316
316
|
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
317
317
|
}
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
318
|
+
const skills = [];
|
|
319
|
+
const mcps = [];
|
|
320
|
+
const subAgents = [];
|
|
321
|
+
const rules = [];
|
|
322
322
|
if (evalRun.capabilityIds && evalRun.capabilityIds.length > 0) {
|
|
323
323
|
const fetchResults = await Promise.allSettled(
|
|
324
324
|
evalRun.capabilityIds.map((id) => api.getCapability(projectId2, id))
|
|
@@ -605,7 +605,7 @@ function getAdapter(identifier) {
|
|
|
605
605
|
}
|
|
606
606
|
|
|
607
607
|
// src/run-scenario/agents/claude-code/claude-code-adapter.ts
|
|
608
|
-
import { AgentRunCommand } from "@wix/evalforge-types";
|
|
608
|
+
import { AgentRunCommand, ClaudeCodeConfigSchema } from "@wix/evalforge-types";
|
|
609
609
|
|
|
610
610
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
611
611
|
import {
|
|
@@ -1187,10 +1187,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1187
1187
|
let lastAction = "Starting...";
|
|
1188
1188
|
let lastToolName;
|
|
1189
1189
|
let lastFilePath;
|
|
1190
|
-
const maxTurns = options.maxTurns
|
|
1190
|
+
const maxTurns = options.maxTurns || void 0;
|
|
1191
1191
|
let messageCount = 0;
|
|
1192
1192
|
const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
|
|
1193
|
-
const baseAllowedTools = [
|
|
1193
|
+
const baseAllowedTools = options.allowedTools ?? [
|
|
1194
1194
|
"Skill",
|
|
1195
1195
|
"Read",
|
|
1196
1196
|
"Write",
|
|
@@ -1204,13 +1204,17 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1204
1204
|
];
|
|
1205
1205
|
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
1206
1206
|
const queryOptions = {
|
|
1207
|
+
...options.extras ?? {},
|
|
1207
1208
|
env: sdkEnv,
|
|
1208
1209
|
cwd: options.cwd,
|
|
1209
1210
|
settingSources: ["project"],
|
|
1210
1211
|
allowedTools,
|
|
1212
|
+
...options.disallowedTools?.length ? { disallowedTools: options.disallowedTools } : {},
|
|
1211
1213
|
model: options.model || DEFAULT_MODEL,
|
|
1212
1214
|
maxTurns,
|
|
1213
1215
|
maxThinkingTokens: options.maxThinkingTokens,
|
|
1216
|
+
...options.effort ? { effort: options.effort } : {},
|
|
1217
|
+
...options.maxBudgetUsd != null ? { maxBudgetUsd: options.maxBudgetUsd } : {},
|
|
1214
1218
|
// Use 'default' permission mode with custom canUseTool handler
|
|
1215
1219
|
// instead of 'bypassPermissions' which fails on root
|
|
1216
1220
|
permissionMode: "default",
|
|
@@ -1285,7 +1289,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1285
1289
|
traceContext.authToken
|
|
1286
1290
|
);
|
|
1287
1291
|
}
|
|
1288
|
-
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
1292
|
+
const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
|
|
1289
1293
|
let timeoutHandle;
|
|
1290
1294
|
let timedOut = false;
|
|
1291
1295
|
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
@@ -2049,7 +2053,7 @@ var ClaudeCodeAdapter = class {
|
|
|
2049
2053
|
skills,
|
|
2050
2054
|
scenario,
|
|
2051
2055
|
cwd,
|
|
2052
|
-
|
|
2056
|
+
config,
|
|
2053
2057
|
aiGatewayUrl,
|
|
2054
2058
|
aiGatewayHeaders,
|
|
2055
2059
|
traceContext,
|
|
@@ -2058,20 +2062,37 @@ var ClaudeCodeAdapter = class {
|
|
|
2058
2062
|
rules,
|
|
2059
2063
|
systemPrompt
|
|
2060
2064
|
} = context;
|
|
2061
|
-
const
|
|
2065
|
+
const typed = config ? ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
2066
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
2067
|
+
const schemaKeys = new Set(Object.keys(ClaudeCodeConfigSchema.shape));
|
|
2068
|
+
const extras = {};
|
|
2069
|
+
if (config) {
|
|
2070
|
+
for (const [key, value] of Object.entries(config)) {
|
|
2071
|
+
if (!schemaKeys.has(key)) extras[key] = value;
|
|
2072
|
+
}
|
|
2073
|
+
}
|
|
2074
|
+
const rawMaxTurns = cfg?.maxTurns;
|
|
2075
|
+
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
2062
2076
|
const options = {
|
|
2063
2077
|
cwd,
|
|
2064
|
-
model:
|
|
2065
|
-
temperature:
|
|
2066
|
-
maxTokens:
|
|
2067
|
-
maxTurns
|
|
2078
|
+
model: cfg?.model,
|
|
2079
|
+
temperature: cfg?.temperature,
|
|
2080
|
+
maxTokens: cfg?.maxTokens,
|
|
2081
|
+
maxTurns,
|
|
2082
|
+
maxThinkingTokens: cfg?.maxThinkingTokens,
|
|
2083
|
+
allowedTools: cfg?.allowedTools,
|
|
2084
|
+
disallowedTools: cfg?.disallowedTools,
|
|
2085
|
+
effort: cfg?.effort,
|
|
2086
|
+
maxBudgetUsd: cfg?.maxBudgetUsd,
|
|
2087
|
+
maxDurationMs: cfg?.maxDurationMs,
|
|
2068
2088
|
aiGatewayUrl,
|
|
2069
2089
|
aiGatewayHeaders,
|
|
2070
2090
|
traceContext,
|
|
2071
2091
|
mcps,
|
|
2072
2092
|
subAgents,
|
|
2073
2093
|
rules,
|
|
2074
|
-
systemPrompt
|
|
2094
|
+
systemPrompt,
|
|
2095
|
+
extras
|
|
2075
2096
|
};
|
|
2076
2097
|
const { result, llmTrace, conversation } = await executeWithClaudeCode(
|
|
2077
2098
|
skills,
|
|
@@ -2098,7 +2119,7 @@ var claudeCodeAdapter = new ClaudeCodeAdapter();
|
|
|
2098
2119
|
defaultRegistry.register(claudeCodeAdapter);
|
|
2099
2120
|
|
|
2100
2121
|
// src/run-scenario/agents/opencode/opencode-adapter.ts
|
|
2101
|
-
import { AgentRunCommand as AgentRunCommand2 } from "@wix/evalforge-types";
|
|
2122
|
+
import { AgentRunCommand as AgentRunCommand2, OpenCodeConfigSchema as OpenCodeConfigSchema2 } from "@wix/evalforge-types";
|
|
2102
2123
|
|
|
2103
2124
|
// src/run-scenario/agents/opencode/execute.ts
|
|
2104
2125
|
import { spawn } from "child_process";
|
|
@@ -2217,25 +2238,33 @@ import { homedir as homedir2 } from "os";
|
|
|
2217
2238
|
import {
|
|
2218
2239
|
ClaudeModel as ClaudeModel2,
|
|
2219
2240
|
AVAILABLE_OPENAI_MODEL_IDS,
|
|
2220
|
-
AVAILABLE_GEMINI_MODEL_IDS
|
|
2241
|
+
AVAILABLE_GEMINI_MODEL_IDS,
|
|
2242
|
+
OpenCodeConfigSchema
|
|
2221
2243
|
} from "@wix/evalforge-types";
|
|
2222
2244
|
var DEFAULT_MODEL2 = `${ClaudeModel2.CLAUDE_4_5_SONNET_1_0}`;
|
|
2245
|
+
var OPENCODE_MODEL_ALIASES = {
|
|
2246
|
+
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
2247
|
+
"claude-opus-4": "claude-opus-4-0"
|
|
2248
|
+
};
|
|
2223
2249
|
function parseModel(model) {
|
|
2224
2250
|
const slashIndex = model.indexOf("/");
|
|
2225
2251
|
if (slashIndex > 0) {
|
|
2252
|
+
const providerID = model.slice(0, slashIndex);
|
|
2253
|
+
const rawModelID = model.slice(slashIndex + 1);
|
|
2226
2254
|
return {
|
|
2227
|
-
providerID
|
|
2228
|
-
modelID:
|
|
2255
|
+
providerID,
|
|
2256
|
+
modelID: OPENCODE_MODEL_ALIASES[rawModelID] ?? rawModelID
|
|
2229
2257
|
};
|
|
2230
2258
|
}
|
|
2259
|
+
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
2231
2260
|
const isOpenAI = AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
2232
2261
|
model
|
|
2233
2262
|
);
|
|
2234
2263
|
const isGemini = AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
2235
2264
|
model
|
|
2236
2265
|
);
|
|
2237
|
-
if (isGemini) return { providerID: "google", modelID
|
|
2238
|
-
return { providerID: isOpenAI ? "openai" : "anthropic", modelID
|
|
2266
|
+
if (isGemini) return { providerID: "google", modelID };
|
|
2267
|
+
return { providerID: isOpenAI ? "openai" : "anthropic", modelID };
|
|
2239
2268
|
}
|
|
2240
2269
|
function toOpenCodeMcpConfig(servers) {
|
|
2241
2270
|
const result = {};
|
|
@@ -2319,20 +2348,26 @@ async function buildOpenCodeEnv(options) {
|
|
|
2319
2348
|
if (options.temperature != null) {
|
|
2320
2349
|
agentOverrides.temperature = options.temperature;
|
|
2321
2350
|
}
|
|
2322
|
-
if (options.
|
|
2351
|
+
if (options.topP != null) {
|
|
2352
|
+
agentOverrides.top_p = options.topP;
|
|
2353
|
+
}
|
|
2354
|
+
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
2323
2355
|
agentOverrides.maxSteps = options.maxTurns;
|
|
2324
2356
|
}
|
|
2357
|
+
const parsed = options.config ? OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
2358
|
+
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
2359
|
+
const defaultPermission = {
|
|
2360
|
+
"*": "allow"
|
|
2361
|
+
};
|
|
2362
|
+
const permission = {
|
|
2363
|
+
...defaultPermission,
|
|
2364
|
+
...configPermission
|
|
2365
|
+
};
|
|
2325
2366
|
const config = {
|
|
2326
2367
|
model: `${providerID}/${modelID}`,
|
|
2327
2368
|
provider,
|
|
2328
2369
|
...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
|
|
2329
|
-
permission
|
|
2330
|
-
edit: "allow",
|
|
2331
|
-
bash: "allow",
|
|
2332
|
-
webfetch: "allow",
|
|
2333
|
-
doom_loop: "allow",
|
|
2334
|
-
external_directory: "allow"
|
|
2335
|
-
},
|
|
2370
|
+
permission,
|
|
2336
2371
|
...mcp ? { mcp } : {}
|
|
2337
2372
|
};
|
|
2338
2373
|
const env = {
|
|
@@ -3066,16 +3101,18 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3066
3101
|
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
3067
3102
|
model: options.model
|
|
3068
3103
|
});
|
|
3069
|
-
const maxTurns = options.maxTurns
|
|
3070
|
-
const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
|
|
3104
|
+
const maxTurns = options.maxTurns || void 0;
|
|
3105
|
+
const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
|
|
3071
3106
|
const { env, providerID, modelID } = await buildOpenCodeEnv({
|
|
3072
3107
|
model: options.model,
|
|
3073
3108
|
temperature: options.temperature,
|
|
3109
|
+
topP: options.topP,
|
|
3074
3110
|
maxTurns,
|
|
3075
3111
|
aiGatewayUrl: options.aiGatewayUrl,
|
|
3076
3112
|
aiGatewayHeaders: options.aiGatewayHeaders,
|
|
3077
3113
|
mcps: options.mcps,
|
|
3078
|
-
cwd: options.cwd
|
|
3114
|
+
cwd: options.cwd,
|
|
3115
|
+
config: options.config
|
|
3079
3116
|
});
|
|
3080
3117
|
const startTime = /* @__PURE__ */ new Date();
|
|
3081
3118
|
const traceContext = options.traceContext;
|
|
@@ -3103,13 +3140,13 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3103
3140
|
traceContext.authToken
|
|
3104
3141
|
);
|
|
3105
3142
|
}
|
|
3143
|
+
const variant = options.thinkingVariant ?? "high";
|
|
3144
|
+
const thinkingArgs = variant === "none" ? [] : ["--thinking", "--variant", variant];
|
|
3106
3145
|
const baseArgs = [
|
|
3107
3146
|
"run",
|
|
3108
3147
|
"--format",
|
|
3109
3148
|
"json",
|
|
3110
|
-
|
|
3111
|
-
"--variant",
|
|
3112
|
-
"high",
|
|
3149
|
+
...thinkingArgs,
|
|
3113
3150
|
"--model",
|
|
3114
3151
|
`${providerID}/${modelID}`,
|
|
3115
3152
|
"--dir",
|
|
@@ -3300,7 +3337,7 @@ var OpenCodeAdapter = class {
|
|
|
3300
3337
|
skills,
|
|
3301
3338
|
scenario,
|
|
3302
3339
|
cwd,
|
|
3303
|
-
|
|
3340
|
+
config,
|
|
3304
3341
|
aiGatewayUrl,
|
|
3305
3342
|
aiGatewayHeaders,
|
|
3306
3343
|
traceContext,
|
|
@@ -3309,18 +3346,26 @@ var OpenCodeAdapter = class {
|
|
|
3309
3346
|
rules,
|
|
3310
3347
|
systemPrompt
|
|
3311
3348
|
} = context;
|
|
3349
|
+
const typed = config ? OpenCodeConfigSchema2.passthrough().safeParse(config) : void 0;
|
|
3350
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
3351
|
+
const rawMaxTurns = cfg?.maxTurns;
|
|
3352
|
+
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
3312
3353
|
const options = {
|
|
3313
3354
|
cwd,
|
|
3314
|
-
model:
|
|
3315
|
-
temperature:
|
|
3316
|
-
|
|
3355
|
+
model: cfg?.model,
|
|
3356
|
+
temperature: cfg?.temperature,
|
|
3357
|
+
topP: cfg?.topP,
|
|
3358
|
+
maxTurns,
|
|
3359
|
+
thinkingVariant: cfg?.thinkingVariant,
|
|
3360
|
+
maxDurationMs: cfg?.maxDurationMs,
|
|
3317
3361
|
aiGatewayUrl,
|
|
3318
3362
|
aiGatewayHeaders,
|
|
3319
3363
|
traceContext,
|
|
3320
3364
|
mcps,
|
|
3321
3365
|
subAgents,
|
|
3322
3366
|
rules,
|
|
3323
|
-
systemPrompt
|
|
3367
|
+
systemPrompt,
|
|
3368
|
+
config
|
|
3324
3369
|
};
|
|
3325
3370
|
const { result, llmTrace, conversation } = await executeWithOpenCode(
|
|
3326
3371
|
skills,
|
|
@@ -3360,7 +3405,8 @@ import {
|
|
|
3360
3405
|
GEMINI_THINKING_MODEL_IDS,
|
|
3361
3406
|
OPENAI_RESPONSES_MODEL_IDS,
|
|
3362
3407
|
LLMStepType as LLMStepType3,
|
|
3363
|
-
LiveTraceEventType as LiveTraceEventType3
|
|
3408
|
+
LiveTraceEventType as LiveTraceEventType3,
|
|
3409
|
+
SimpleAgentConfigSchema
|
|
3364
3410
|
} from "@wix/evalforge-types";
|
|
3365
3411
|
import { randomUUID as randomUUID3 } from "crypto";
|
|
3366
3412
|
|
|
@@ -3646,20 +3692,30 @@ async function executeWithAiSdk(context) {
|
|
|
3646
3692
|
const {
|
|
3647
3693
|
scenario,
|
|
3648
3694
|
cwd,
|
|
3649
|
-
|
|
3695
|
+
config,
|
|
3650
3696
|
aiGatewayUrl,
|
|
3651
3697
|
aiGatewayHeaders = {},
|
|
3652
3698
|
mcps,
|
|
3653
3699
|
traceContext
|
|
3654
3700
|
} = context;
|
|
3701
|
+
const typed = config ? SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
3702
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
3703
|
+
const schemaKeys = new Set(Object.keys(SimpleAgentConfigSchema.shape));
|
|
3704
|
+
const configExtras = {};
|
|
3705
|
+
if (config) {
|
|
3706
|
+
for (const [key, value] of Object.entries(config)) {
|
|
3707
|
+
if (!schemaKeys.has(key)) configExtras[key] = value;
|
|
3708
|
+
}
|
|
3709
|
+
}
|
|
3655
3710
|
if (!aiGatewayUrl) {
|
|
3656
3711
|
throw new Error("Simple Agent requires aiGatewayUrl");
|
|
3657
3712
|
}
|
|
3658
|
-
if (!
|
|
3659
|
-
throw new Error("Simple Agent requires a model in
|
|
3713
|
+
if (!cfg?.model) {
|
|
3714
|
+
throw new Error("Simple Agent requires a model in config");
|
|
3660
3715
|
}
|
|
3661
|
-
const
|
|
3662
|
-
const
|
|
3716
|
+
const modelId = cfg.model;
|
|
3717
|
+
const model = createModel(modelId, aiGatewayUrl, aiGatewayHeaders);
|
|
3718
|
+
const provider = isClaudeModelId(modelId) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelId) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
|
|
3663
3719
|
const systemPrompt = composeSystemPrompt(context);
|
|
3664
3720
|
const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
|
|
3665
3721
|
const startTime = Date.now();
|
|
@@ -3669,15 +3725,17 @@ async function executeWithAiSdk(context) {
|
|
|
3669
3725
|
try {
|
|
3670
3726
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
3671
3727
|
const isResponsesAPI = [...OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3672
|
-
(id) =>
|
|
3728
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3673
3729
|
);
|
|
3674
3730
|
const isGemini = provider === PROVIDER_GEMINI2;
|
|
3675
|
-
const isGeminiThinking = isGemini && GEMINI_THINKING_MODEL_IDS.has(
|
|
3731
|
+
const isGeminiThinking = isGemini && GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
3676
3732
|
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
3677
|
-
const
|
|
3733
|
+
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
3734
|
+
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
3735
|
+
const computedProviderOpts = {
|
|
3678
3736
|
...isAnthropic && {
|
|
3679
3737
|
anthropic: {
|
|
3680
|
-
thinking: { type: "enabled", budgetTokens:
|
|
3738
|
+
thinking: { type: "enabled", budgetTokens: thinkingBudgetTokens }
|
|
3681
3739
|
}
|
|
3682
3740
|
},
|
|
3683
3741
|
...isResponsesAPI && {
|
|
@@ -3685,7 +3743,7 @@ async function executeWithAiSdk(context) {
|
|
|
3685
3743
|
// Prevent the SDK from sending item_reference inputs that the proxy can't forward
|
|
3686
3744
|
store: false,
|
|
3687
3745
|
forceReasoning: true,
|
|
3688
|
-
reasoningEffort
|
|
3746
|
+
reasoningEffort,
|
|
3689
3747
|
reasoningSummary: "detailed"
|
|
3690
3748
|
}
|
|
3691
3749
|
},
|
|
@@ -3698,10 +3756,17 @@ async function executeWithAiSdk(context) {
|
|
|
3698
3756
|
}
|
|
3699
3757
|
}
|
|
3700
3758
|
};
|
|
3759
|
+
const { providerOptions: extraProviderOptions, ...topLevelExtras } = configExtras;
|
|
3760
|
+
const mergedProviderOptions = {
|
|
3761
|
+
...extraProviderOptions && typeof extraProviderOptions === "object" ? extraProviderOptions : {},
|
|
3762
|
+
...computedProviderOpts
|
|
3763
|
+
};
|
|
3701
3764
|
const stepTimestamps = [];
|
|
3765
|
+
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3702
3766
|
const { triggerPromptImages } = context;
|
|
3703
3767
|
const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
|
|
3704
|
-
const
|
|
3768
|
+
const result = await generateText({
|
|
3769
|
+
...topLevelExtras,
|
|
3705
3770
|
model,
|
|
3706
3771
|
system: systemPrompt,
|
|
3707
3772
|
...hasImages ? {
|
|
@@ -3719,11 +3784,16 @@ async function executeWithAiSdk(context) {
|
|
|
3719
3784
|
}
|
|
3720
3785
|
]
|
|
3721
3786
|
} : { prompt: scenario.triggerPrompt },
|
|
3722
|
-
temperature: supportsThinking ? void 0 :
|
|
3723
|
-
|
|
3787
|
+
temperature: supportsThinking ? void 0 : cfg.temperature,
|
|
3788
|
+
topP: supportsThinking ? void 0 : cfg.topP,
|
|
3789
|
+
frequencyPenalty: cfg.frequencyPenalty,
|
|
3790
|
+
presencePenalty: cfg.presencePenalty,
|
|
3791
|
+
seed: cfg.seed,
|
|
3792
|
+
stopSequences: cfg.stopSequences,
|
|
3793
|
+
maxOutputTokens: cfg.maxTokens,
|
|
3724
3794
|
tools: mcpTools,
|
|
3725
|
-
|
|
3726
|
-
providerOptions:
|
|
3795
|
+
...mcpTools && effectiveMaxTurns != null ? { stopWhen: stepCountIs(effectiveMaxTurns) } : !mcpTools ? { stopWhen: stepCountIs(1) } : {},
|
|
3796
|
+
providerOptions: mergedProviderOptions,
|
|
3727
3797
|
onStepFinish: (step) => {
|
|
3728
3798
|
stepTimestamps.push(Date.now());
|
|
3729
3799
|
if (traceContext) {
|
|
@@ -3751,8 +3821,7 @@ async function executeWithAiSdk(context) {
|
|
|
3751
3821
|
);
|
|
3752
3822
|
}
|
|
3753
3823
|
}
|
|
3754
|
-
};
|
|
3755
|
-
const result = await generateText(generateTextParams);
|
|
3824
|
+
});
|
|
3756
3825
|
const durationMs = Date.now() - startTime;
|
|
3757
3826
|
const usage = {
|
|
3758
3827
|
inputTokens: result.usage.inputTokens ?? 0,
|
|
@@ -3763,7 +3832,7 @@ async function executeWithAiSdk(context) {
|
|
|
3763
3832
|
result.steps,
|
|
3764
3833
|
durationMs,
|
|
3765
3834
|
usage,
|
|
3766
|
-
|
|
3835
|
+
modelId,
|
|
3767
3836
|
provider,
|
|
3768
3837
|
startTime,
|
|
3769
3838
|
stepTimestamps
|
|
@@ -4676,11 +4745,12 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4676
4745
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4677
4746
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4678
4747
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4748
|
+
const agentConfig = agent?.config;
|
|
4679
4749
|
const executionContext = {
|
|
4680
4750
|
skills: evalData.skills,
|
|
4681
4751
|
scenario,
|
|
4682
4752
|
cwd: workDir || process.cwd(),
|
|
4683
|
-
|
|
4753
|
+
config: agentConfig,
|
|
4684
4754
|
aiGatewayUrl: config.aiGatewayUrl,
|
|
4685
4755
|
aiGatewayHeaders: config.aiGatewayHeaders,
|
|
4686
4756
|
traceContext: {
|
|
@@ -4715,7 +4785,18 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4715
4785
|
infrastructurePaths
|
|
4716
4786
|
);
|
|
4717
4787
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4718
|
-
const
|
|
4788
|
+
const snapshotModelConfig = agentConfig?.model ? {
|
|
4789
|
+
model: agentConfig.model,
|
|
4790
|
+
...agentConfig.temperature != null && {
|
|
4791
|
+
temperature: agentConfig.temperature
|
|
4792
|
+
},
|
|
4793
|
+
...agentConfig.maxTokens != null && {
|
|
4794
|
+
maxTokens: agentConfig.maxTokens
|
|
4795
|
+
},
|
|
4796
|
+
...agentConfig.maxTurns != null && {
|
|
4797
|
+
maxTurns: agentConfig.maxTurns
|
|
4798
|
+
}
|
|
4799
|
+
} : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
|
|
4719
4800
|
return {
|
|
4720
4801
|
id: randomUUID4(),
|
|
4721
4802
|
targetId,
|
|
@@ -4723,7 +4804,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4723
4804
|
scenarioId: scenario.id,
|
|
4724
4805
|
scenarioName: scenario.name,
|
|
4725
4806
|
triggerPrompt: scenario.triggerPrompt,
|
|
4726
|
-
modelConfig:
|
|
4807
|
+
modelConfig: snapshotModelConfig,
|
|
4727
4808
|
duration: durationMs,
|
|
4728
4809
|
outputText,
|
|
4729
4810
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|