@wix/evalforge-evaluator 0.147.0 → 0.149.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +137 -58
- package/build/index.js.map +2 -2
- package/build/index.mjs +143 -62
- package/build/index.mjs.map +3 -3
- package/build/types/run-scenario/agents/claude-code/types.d.ts +15 -3
- package/build/types/run-scenario/agents/opencode/config.d.ts +3 -0
- package/build/types/run-scenario/agents/opencode/types.d.ts +8 -0
- package/package.json +5 -5
package/build/index.js
CHANGED
|
@@ -330,10 +330,10 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
|
|
|
330
330
|
if (evalRun.agentId) {
|
|
331
331
|
agent = await api.getAgent(projectId2, evalRun.agentId);
|
|
332
332
|
}
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
333
|
+
const skills = [];
|
|
334
|
+
const mcps = [];
|
|
335
|
+
const subAgents = [];
|
|
336
|
+
const rules = [];
|
|
337
337
|
if (evalRun.capabilityIds && evalRun.capabilityIds.length > 0) {
|
|
338
338
|
const fetchResults = await Promise.allSettled(
|
|
339
339
|
evalRun.capabilityIds.map((id) => api.getCapability(projectId2, id))
|
|
@@ -1190,10 +1190,10 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1190
1190
|
let lastAction = "Starting...";
|
|
1191
1191
|
let lastToolName;
|
|
1192
1192
|
let lastFilePath;
|
|
1193
|
-
const maxTurns = options.maxTurns
|
|
1193
|
+
const maxTurns = options.maxTurns || void 0;
|
|
1194
1194
|
let messageCount = 0;
|
|
1195
1195
|
const canUseTool = async (_toolName, input) => ({ behavior: "allow", updatedInput: input });
|
|
1196
|
-
const baseAllowedTools = [
|
|
1196
|
+
const baseAllowedTools = options.allowedTools ?? [
|
|
1197
1197
|
"Skill",
|
|
1198
1198
|
"Read",
|
|
1199
1199
|
"Write",
|
|
@@ -1207,13 +1207,17 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1207
1207
|
];
|
|
1208
1208
|
const allowedTools = (options.mcps?.length ?? 0) > 0 ? [...baseAllowedTools, "mcp__*"] : baseAllowedTools;
|
|
1209
1209
|
const queryOptions = {
|
|
1210
|
+
...options.extras ?? {},
|
|
1210
1211
|
env: sdkEnv,
|
|
1211
1212
|
cwd: options.cwd,
|
|
1212
1213
|
settingSources: ["project"],
|
|
1213
1214
|
allowedTools,
|
|
1215
|
+
...options.disallowedTools?.length ? { disallowedTools: options.disallowedTools } : {},
|
|
1214
1216
|
model: options.model || DEFAULT_MODEL,
|
|
1215
1217
|
maxTurns,
|
|
1216
1218
|
maxThinkingTokens: options.maxThinkingTokens,
|
|
1219
|
+
...options.effort ? { effort: options.effort } : {},
|
|
1220
|
+
...options.maxBudgetUsd != null ? { maxBudgetUsd: options.maxBudgetUsd } : {},
|
|
1217
1221
|
// Use 'default' permission mode with custom canUseTool handler
|
|
1218
1222
|
// instead of 'bypassPermissions' which fails on root
|
|
1219
1223
|
permissionMode: "default",
|
|
@@ -1288,7 +1292,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1288
1292
|
traceContext.authToken
|
|
1289
1293
|
);
|
|
1290
1294
|
}
|
|
1291
|
-
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
1295
|
+
const SDK_TIMEOUT_MS = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
|
|
1292
1296
|
let timeoutHandle;
|
|
1293
1297
|
let timedOut = false;
|
|
1294
1298
|
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
@@ -2052,7 +2056,7 @@ var ClaudeCodeAdapter = class {
|
|
|
2052
2056
|
skills,
|
|
2053
2057
|
scenario,
|
|
2054
2058
|
cwd,
|
|
2055
|
-
|
|
2059
|
+
config,
|
|
2056
2060
|
aiGatewayUrl,
|
|
2057
2061
|
aiGatewayHeaders,
|
|
2058
2062
|
traceContext,
|
|
@@ -2061,20 +2065,37 @@ var ClaudeCodeAdapter = class {
|
|
|
2061
2065
|
rules,
|
|
2062
2066
|
systemPrompt
|
|
2063
2067
|
} = context;
|
|
2064
|
-
const
|
|
2068
|
+
const typed = config ? import_evalforge_types5.ClaudeCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
2069
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
2070
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types5.ClaudeCodeConfigSchema.shape));
|
|
2071
|
+
const extras = {};
|
|
2072
|
+
if (config) {
|
|
2073
|
+
for (const [key, value] of Object.entries(config)) {
|
|
2074
|
+
if (!schemaKeys.has(key)) extras[key] = value;
|
|
2075
|
+
}
|
|
2076
|
+
}
|
|
2077
|
+
const rawMaxTurns = cfg?.maxTurns;
|
|
2078
|
+
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
2065
2079
|
const options = {
|
|
2066
2080
|
cwd,
|
|
2067
|
-
model:
|
|
2068
|
-
temperature:
|
|
2069
|
-
maxTokens:
|
|
2070
|
-
maxTurns
|
|
2081
|
+
model: cfg?.model,
|
|
2082
|
+
temperature: cfg?.temperature,
|
|
2083
|
+
maxTokens: cfg?.maxTokens,
|
|
2084
|
+
maxTurns,
|
|
2085
|
+
maxThinkingTokens: cfg?.maxThinkingTokens,
|
|
2086
|
+
allowedTools: cfg?.allowedTools,
|
|
2087
|
+
disallowedTools: cfg?.disallowedTools,
|
|
2088
|
+
effort: cfg?.effort,
|
|
2089
|
+
maxBudgetUsd: cfg?.maxBudgetUsd,
|
|
2090
|
+
maxDurationMs: cfg?.maxDurationMs,
|
|
2071
2091
|
aiGatewayUrl,
|
|
2072
2092
|
aiGatewayHeaders,
|
|
2073
2093
|
traceContext,
|
|
2074
2094
|
mcps,
|
|
2075
2095
|
subAgents,
|
|
2076
2096
|
rules,
|
|
2077
|
-
systemPrompt
|
|
2097
|
+
systemPrompt,
|
|
2098
|
+
extras
|
|
2078
2099
|
};
|
|
2079
2100
|
const { result, llmTrace, conversation } = await executeWithClaudeCode(
|
|
2080
2101
|
skills,
|
|
@@ -2214,22 +2235,29 @@ async function writeSubAgentsToFilesystem2(cwd, subAgents, fetchFn = import_eval
|
|
|
2214
2235
|
var import_os3 = require("os");
|
|
2215
2236
|
var import_evalforge_types6 = require("@wix/evalforge-types");
|
|
2216
2237
|
var DEFAULT_MODEL2 = `${import_evalforge_types6.ClaudeModel.CLAUDE_4_5_SONNET_1_0}`;
|
|
2238
|
+
var OPENCODE_MODEL_ALIASES = {
|
|
2239
|
+
"claude-sonnet-4": "claude-sonnet-4-0",
|
|
2240
|
+
"claude-opus-4": "claude-opus-4-0"
|
|
2241
|
+
};
|
|
2217
2242
|
function parseModel(model) {
|
|
2218
2243
|
const slashIndex = model.indexOf("/");
|
|
2219
2244
|
if (slashIndex > 0) {
|
|
2245
|
+
const providerID = model.slice(0, slashIndex);
|
|
2246
|
+
const rawModelID = model.slice(slashIndex + 1);
|
|
2220
2247
|
return {
|
|
2221
|
-
providerID
|
|
2222
|
-
modelID:
|
|
2248
|
+
providerID,
|
|
2249
|
+
modelID: OPENCODE_MODEL_ALIASES[rawModelID] ?? rawModelID
|
|
2223
2250
|
};
|
|
2224
2251
|
}
|
|
2252
|
+
const modelID = OPENCODE_MODEL_ALIASES[model] ?? model;
|
|
2225
2253
|
const isOpenAI = import_evalforge_types6.AVAILABLE_OPENAI_MODEL_IDS.includes(
|
|
2226
2254
|
model
|
|
2227
2255
|
);
|
|
2228
2256
|
const isGemini = import_evalforge_types6.AVAILABLE_GEMINI_MODEL_IDS.includes(
|
|
2229
2257
|
model
|
|
2230
2258
|
);
|
|
2231
|
-
if (isGemini) return { providerID: "google", modelID
|
|
2232
|
-
return { providerID: isOpenAI ? "openai" : "anthropic", modelID
|
|
2259
|
+
if (isGemini) return { providerID: "google", modelID };
|
|
2260
|
+
return { providerID: isOpenAI ? "openai" : "anthropic", modelID };
|
|
2233
2261
|
}
|
|
2234
2262
|
function toOpenCodeMcpConfig(servers) {
|
|
2235
2263
|
const result = {};
|
|
@@ -2313,20 +2341,26 @@ async function buildOpenCodeEnv(options) {
|
|
|
2313
2341
|
if (options.temperature != null) {
|
|
2314
2342
|
agentOverrides.temperature = options.temperature;
|
|
2315
2343
|
}
|
|
2316
|
-
if (options.
|
|
2344
|
+
if (options.topP != null) {
|
|
2345
|
+
agentOverrides.top_p = options.topP;
|
|
2346
|
+
}
|
|
2347
|
+
if (options.maxTurns != null && options.maxTurns > 0) {
|
|
2317
2348
|
agentOverrides.maxSteps = options.maxTurns;
|
|
2318
2349
|
}
|
|
2350
|
+
const parsed = options.config ? import_evalforge_types6.OpenCodeConfigSchema.passthrough().safeParse(options.config) : void 0;
|
|
2351
|
+
const configPermission = parsed?.success ? parsed.data.permission : void 0;
|
|
2352
|
+
const defaultPermission = {
|
|
2353
|
+
"*": "allow"
|
|
2354
|
+
};
|
|
2355
|
+
const permission = {
|
|
2356
|
+
...defaultPermission,
|
|
2357
|
+
...configPermission
|
|
2358
|
+
};
|
|
2319
2359
|
const config = {
|
|
2320
2360
|
model: `${providerID}/${modelID}`,
|
|
2321
2361
|
provider,
|
|
2322
2362
|
...Object.keys(agentOverrides).length > 0 ? { agent: { build: agentOverrides } } : {},
|
|
2323
|
-
permission
|
|
2324
|
-
edit: "allow",
|
|
2325
|
-
bash: "allow",
|
|
2326
|
-
webfetch: "allow",
|
|
2327
|
-
doom_loop: "allow",
|
|
2328
|
-
external_directory: "allow"
|
|
2329
|
-
},
|
|
2363
|
+
permission,
|
|
2330
2364
|
...mcp ? { mcp } : {}
|
|
2331
2365
|
};
|
|
2332
2366
|
const env = {
|
|
@@ -3060,16 +3094,18 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3060
3094
|
hasAiGatewayHeaders: !!options.aiGatewayHeaders,
|
|
3061
3095
|
model: options.model
|
|
3062
3096
|
});
|
|
3063
|
-
const maxTurns = options.maxTurns
|
|
3064
|
-
const sdkTimeoutMs = Math.max(3e5, maxTurns * 6e4);
|
|
3097
|
+
const maxTurns = options.maxTurns || void 0;
|
|
3098
|
+
const sdkTimeoutMs = options.maxDurationMs ?? Math.max(3e5, (maxTurns ?? 10) * 6e4);
|
|
3065
3099
|
const { env, providerID, modelID } = await buildOpenCodeEnv({
|
|
3066
3100
|
model: options.model,
|
|
3067
3101
|
temperature: options.temperature,
|
|
3102
|
+
topP: options.topP,
|
|
3068
3103
|
maxTurns,
|
|
3069
3104
|
aiGatewayUrl: options.aiGatewayUrl,
|
|
3070
3105
|
aiGatewayHeaders: options.aiGatewayHeaders,
|
|
3071
3106
|
mcps: options.mcps,
|
|
3072
|
-
cwd: options.cwd
|
|
3107
|
+
cwd: options.cwd,
|
|
3108
|
+
config: options.config
|
|
3073
3109
|
});
|
|
3074
3110
|
const startTime = /* @__PURE__ */ new Date();
|
|
3075
3111
|
const traceContext = options.traceContext;
|
|
@@ -3097,13 +3133,13 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3097
3133
|
traceContext.authToken
|
|
3098
3134
|
);
|
|
3099
3135
|
}
|
|
3136
|
+
const variant = options.thinkingVariant ?? "high";
|
|
3137
|
+
const thinkingArgs = variant === "none" ? [] : ["--thinking", "--variant", variant];
|
|
3100
3138
|
const baseArgs = [
|
|
3101
3139
|
"run",
|
|
3102
3140
|
"--format",
|
|
3103
3141
|
"json",
|
|
3104
|
-
|
|
3105
|
-
"--variant",
|
|
3106
|
-
"high",
|
|
3142
|
+
...thinkingArgs,
|
|
3107
3143
|
"--model",
|
|
3108
3144
|
`${providerID}/${modelID}`,
|
|
3109
3145
|
"--dir",
|
|
@@ -3294,7 +3330,7 @@ var OpenCodeAdapter = class {
|
|
|
3294
3330
|
skills,
|
|
3295
3331
|
scenario,
|
|
3296
3332
|
cwd,
|
|
3297
|
-
|
|
3333
|
+
config,
|
|
3298
3334
|
aiGatewayUrl,
|
|
3299
3335
|
aiGatewayHeaders,
|
|
3300
3336
|
traceContext,
|
|
@@ -3303,18 +3339,26 @@ var OpenCodeAdapter = class {
|
|
|
3303
3339
|
rules,
|
|
3304
3340
|
systemPrompt
|
|
3305
3341
|
} = context;
|
|
3342
|
+
const typed = config ? import_evalforge_types9.OpenCodeConfigSchema.passthrough().safeParse(config) : void 0;
|
|
3343
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
3344
|
+
const rawMaxTurns = cfg?.maxTurns;
|
|
3345
|
+
const maxTurns = rawMaxTurns === 0 ? void 0 : rawMaxTurns;
|
|
3306
3346
|
const options = {
|
|
3307
3347
|
cwd,
|
|
3308
|
-
model:
|
|
3309
|
-
temperature:
|
|
3310
|
-
|
|
3348
|
+
model: cfg?.model,
|
|
3349
|
+
temperature: cfg?.temperature,
|
|
3350
|
+
topP: cfg?.topP,
|
|
3351
|
+
maxTurns,
|
|
3352
|
+
thinkingVariant: cfg?.thinkingVariant,
|
|
3353
|
+
maxDurationMs: cfg?.maxDurationMs,
|
|
3311
3354
|
aiGatewayUrl,
|
|
3312
3355
|
aiGatewayHeaders,
|
|
3313
3356
|
traceContext,
|
|
3314
3357
|
mcps,
|
|
3315
3358
|
subAgents,
|
|
3316
3359
|
rules,
|
|
3317
|
-
systemPrompt
|
|
3360
|
+
systemPrompt,
|
|
3361
|
+
config
|
|
3318
3362
|
};
|
|
3319
3363
|
const { result, llmTrace, conversation } = await executeWithOpenCode(
|
|
3320
3364
|
skills,
|
|
@@ -3630,20 +3674,30 @@ async function executeWithAiSdk(context) {
|
|
|
3630
3674
|
const {
|
|
3631
3675
|
scenario,
|
|
3632
3676
|
cwd,
|
|
3633
|
-
|
|
3677
|
+
config,
|
|
3634
3678
|
aiGatewayUrl,
|
|
3635
3679
|
aiGatewayHeaders = {},
|
|
3636
3680
|
mcps,
|
|
3637
3681
|
traceContext
|
|
3638
3682
|
} = context;
|
|
3683
|
+
const typed = config ? import_evalforge_types11.SimpleAgentConfigSchema.passthrough().safeParse(config) : void 0;
|
|
3684
|
+
const cfg = typed?.success ? typed.data : void 0;
|
|
3685
|
+
const schemaKeys = new Set(Object.keys(import_evalforge_types11.SimpleAgentConfigSchema.shape));
|
|
3686
|
+
const configExtras = {};
|
|
3687
|
+
if (config) {
|
|
3688
|
+
for (const [key, value] of Object.entries(config)) {
|
|
3689
|
+
if (!schemaKeys.has(key)) configExtras[key] = value;
|
|
3690
|
+
}
|
|
3691
|
+
}
|
|
3639
3692
|
if (!aiGatewayUrl) {
|
|
3640
3693
|
throw new Error("Simple Agent requires aiGatewayUrl");
|
|
3641
3694
|
}
|
|
3642
|
-
if (!
|
|
3643
|
-
throw new Error("Simple Agent requires a model in
|
|
3695
|
+
if (!cfg?.model) {
|
|
3696
|
+
throw new Error("Simple Agent requires a model in config");
|
|
3644
3697
|
}
|
|
3645
|
-
const
|
|
3646
|
-
const
|
|
3698
|
+
const modelId = cfg.model;
|
|
3699
|
+
const model = createModel(modelId, aiGatewayUrl, aiGatewayHeaders);
|
|
3700
|
+
const provider = isClaudeModelId(modelId) ? PROVIDER_ANTHROPIC2 : isGeminiModelId(modelId) ? PROVIDER_GEMINI2 : PROVIDER_OPENAI;
|
|
3647
3701
|
const systemPrompt = composeSystemPrompt(context);
|
|
3648
3702
|
const { tools: mcpTools, clients } = mcps && mcps.length > 0 ? await buildMcpTools(mcps, cwd) : { tools: void 0, clients: [] };
|
|
3649
3703
|
const startTime = Date.now();
|
|
@@ -3653,15 +3707,17 @@ async function executeWithAiSdk(context) {
|
|
|
3653
3707
|
try {
|
|
3654
3708
|
const isAnthropic = provider === PROVIDER_ANTHROPIC2;
|
|
3655
3709
|
const isResponsesAPI = [...import_evalforge_types11.OPENAI_RESPONSES_MODEL_IDS].some(
|
|
3656
|
-
(id) =>
|
|
3710
|
+
(id) => modelId === id || modelId.startsWith(id)
|
|
3657
3711
|
);
|
|
3658
3712
|
const isGemini = provider === PROVIDER_GEMINI2;
|
|
3659
|
-
const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(
|
|
3713
|
+
const isGeminiThinking = isGemini && import_evalforge_types11.GEMINI_THINKING_MODEL_IDS.has(modelId);
|
|
3660
3714
|
const supportsThinking = isAnthropic || isResponsesAPI || isGeminiThinking;
|
|
3661
|
-
const
|
|
3715
|
+
const thinkingBudgetTokens = cfg.thinkingBudgetTokens ?? 1e4;
|
|
3716
|
+
const reasoningEffort = cfg.reasoningEffort ?? "high";
|
|
3717
|
+
const computedProviderOpts = {
|
|
3662
3718
|
...isAnthropic && {
|
|
3663
3719
|
anthropic: {
|
|
3664
|
-
thinking: { type: "enabled", budgetTokens:
|
|
3720
|
+
thinking: { type: "enabled", budgetTokens: thinkingBudgetTokens }
|
|
3665
3721
|
}
|
|
3666
3722
|
},
|
|
3667
3723
|
...isResponsesAPI && {
|
|
@@ -3669,7 +3725,7 @@ async function executeWithAiSdk(context) {
|
|
|
3669
3725
|
// Prevent the SDK from sending item_reference inputs that the proxy can't forward
|
|
3670
3726
|
store: false,
|
|
3671
3727
|
forceReasoning: true,
|
|
3672
|
-
reasoningEffort
|
|
3728
|
+
reasoningEffort,
|
|
3673
3729
|
reasoningSummary: "detailed"
|
|
3674
3730
|
}
|
|
3675
3731
|
},
|
|
@@ -3682,10 +3738,17 @@ async function executeWithAiSdk(context) {
|
|
|
3682
3738
|
}
|
|
3683
3739
|
}
|
|
3684
3740
|
};
|
|
3741
|
+
const { providerOptions: extraProviderOptions, ...topLevelExtras } = configExtras;
|
|
3742
|
+
const mergedProviderOptions = {
|
|
3743
|
+
...extraProviderOptions && typeof extraProviderOptions === "object" ? extraProviderOptions : {},
|
|
3744
|
+
...computedProviderOpts
|
|
3745
|
+
};
|
|
3685
3746
|
const stepTimestamps = [];
|
|
3747
|
+
const effectiveMaxTurns = cfg.maxTurns === 0 ? void 0 : cfg.maxTurns ?? DEFAULT_MAX_TOOL_STEPS;
|
|
3686
3748
|
const { triggerPromptImages } = context;
|
|
3687
3749
|
const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
|
|
3688
|
-
const
|
|
3750
|
+
const result = await (0, import_ai.generateText)({
|
|
3751
|
+
...topLevelExtras,
|
|
3689
3752
|
model,
|
|
3690
3753
|
system: systemPrompt,
|
|
3691
3754
|
...hasImages ? {
|
|
@@ -3703,11 +3766,16 @@ async function executeWithAiSdk(context) {
|
|
|
3703
3766
|
}
|
|
3704
3767
|
]
|
|
3705
3768
|
} : { prompt: scenario.triggerPrompt },
|
|
3706
|
-
temperature: supportsThinking ? void 0 :
|
|
3707
|
-
|
|
3769
|
+
temperature: supportsThinking ? void 0 : cfg.temperature,
|
|
3770
|
+
topP: supportsThinking ? void 0 : cfg.topP,
|
|
3771
|
+
frequencyPenalty: cfg.frequencyPenalty,
|
|
3772
|
+
presencePenalty: cfg.presencePenalty,
|
|
3773
|
+
seed: cfg.seed,
|
|
3774
|
+
stopSequences: cfg.stopSequences,
|
|
3775
|
+
maxOutputTokens: cfg.maxTokens,
|
|
3708
3776
|
tools: mcpTools,
|
|
3709
|
-
|
|
3710
|
-
providerOptions:
|
|
3777
|
+
...mcpTools && effectiveMaxTurns != null ? { stopWhen: (0, import_ai.stepCountIs)(effectiveMaxTurns) } : !mcpTools ? { stopWhen: (0, import_ai.stepCountIs)(1) } : {},
|
|
3778
|
+
providerOptions: mergedProviderOptions,
|
|
3711
3779
|
onStepFinish: (step) => {
|
|
3712
3780
|
stepTimestamps.push(Date.now());
|
|
3713
3781
|
if (traceContext) {
|
|
@@ -3735,8 +3803,7 @@ async function executeWithAiSdk(context) {
|
|
|
3735
3803
|
);
|
|
3736
3804
|
}
|
|
3737
3805
|
}
|
|
3738
|
-
};
|
|
3739
|
-
const result = await (0, import_ai.generateText)(generateTextParams);
|
|
3806
|
+
});
|
|
3740
3807
|
const durationMs = Date.now() - startTime;
|
|
3741
3808
|
const usage = {
|
|
3742
3809
|
inputTokens: result.usage.inputTokens ?? 0,
|
|
@@ -3747,7 +3814,7 @@ async function executeWithAiSdk(context) {
|
|
|
3747
3814
|
result.steps,
|
|
3748
3815
|
durationMs,
|
|
3749
3816
|
usage,
|
|
3750
|
-
|
|
3817
|
+
modelId,
|
|
3751
3818
|
provider,
|
|
3752
3819
|
startTime,
|
|
3753
3820
|
stepTimestamps
|
|
@@ -4660,11 +4727,12 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4660
4727
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
4661
4728
|
const targetId = evalData.evalRun.presetId ?? agent?.id ?? evalData.evalRun.id;
|
|
4662
4729
|
const targetName = evalData.presetName || agent?.name || "";
|
|
4730
|
+
const agentConfig = agent?.config;
|
|
4663
4731
|
const executionContext = {
|
|
4664
4732
|
skills: evalData.skills,
|
|
4665
4733
|
scenario,
|
|
4666
4734
|
cwd: workDir || process.cwd(),
|
|
4667
|
-
|
|
4735
|
+
config: agentConfig,
|
|
4668
4736
|
aiGatewayUrl: config.aiGatewayUrl,
|
|
4669
4737
|
aiGatewayHeaders: config.aiGatewayHeaders,
|
|
4670
4738
|
traceContext: {
|
|
@@ -4699,7 +4767,18 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4699
4767
|
infrastructurePaths
|
|
4700
4768
|
);
|
|
4701
4769
|
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot, infrastructurePaths) : void 0;
|
|
4702
|
-
const
|
|
4770
|
+
const snapshotModelConfig = agentConfig?.model ? {
|
|
4771
|
+
model: agentConfig.model,
|
|
4772
|
+
...agentConfig.temperature != null && {
|
|
4773
|
+
temperature: agentConfig.temperature
|
|
4774
|
+
},
|
|
4775
|
+
...agentConfig.maxTokens != null && {
|
|
4776
|
+
maxTokens: agentConfig.maxTokens
|
|
4777
|
+
},
|
|
4778
|
+
...agentConfig.maxTurns != null && {
|
|
4779
|
+
maxTurns: agentConfig.maxTurns
|
|
4780
|
+
}
|
|
4781
|
+
} : agent?.modelConfig ?? (llmTrace?.summary.modelsUsed?.[0] ? { model: llmTrace.summary.modelsUsed[0] } : void 0);
|
|
4703
4782
|
return {
|
|
4704
4783
|
id: (0, import_crypto4.randomUUID)(),
|
|
4705
4784
|
targetId,
|
|
@@ -4707,7 +4786,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4707
4786
|
scenarioId: scenario.id,
|
|
4708
4787
|
scenarioName: scenario.name,
|
|
4709
4788
|
triggerPrompt: scenario.triggerPrompt,
|
|
4710
|
-
modelConfig:
|
|
4789
|
+
modelConfig: snapshotModelConfig,
|
|
4711
4790
|
duration: durationMs,
|
|
4712
4791
|
outputText,
|
|
4713
4792
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|