@wix/evalforge-evaluator 0.139.0 → 0.141.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js
CHANGED
|
@@ -287,13 +287,16 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
287
287
|
}
|
|
288
288
|
};
|
|
289
289
|
break;
|
|
290
|
-
case "build_passed":
|
|
290
|
+
case "build_passed": {
|
|
291
|
+
const rawCmd = params?.command;
|
|
292
|
+
const command = typeof rawCmd === "string" && (0, import_evalforge_types.isAllowedBuildCommandString)(rawCmd) ? rawCmd.trim() : void 0;
|
|
291
293
|
baseAssertion = {
|
|
292
294
|
type: "build_passed",
|
|
293
|
-
command
|
|
295
|
+
...command !== void 0 && { command },
|
|
294
296
|
expectedExitCode: params?.expectedExitCode ?? void 0
|
|
295
297
|
};
|
|
296
298
|
break;
|
|
299
|
+
}
|
|
297
300
|
case "time_limit":
|
|
298
301
|
baseAssertion = {
|
|
299
302
|
type: "time_limit",
|
|
@@ -972,6 +975,26 @@ function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
|
972
975
|
|
|
973
976
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
974
977
|
var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
978
|
+
async function* buildPromptStream(triggerPrompt, images) {
|
|
979
|
+
yield {
|
|
980
|
+
type: "user",
|
|
981
|
+
message: {
|
|
982
|
+
role: "user",
|
|
983
|
+
content: [
|
|
984
|
+
{ type: "text", text: triggerPrompt },
|
|
985
|
+
...images.map((img) => ({
|
|
986
|
+
type: "image",
|
|
987
|
+
source: {
|
|
988
|
+
type: "base64",
|
|
989
|
+
data: img.base64,
|
|
990
|
+
media_type: img.mediaType
|
|
991
|
+
}
|
|
992
|
+
}))
|
|
993
|
+
]
|
|
994
|
+
},
|
|
995
|
+
parent_tool_use_id: null
|
|
996
|
+
};
|
|
997
|
+
}
|
|
975
998
|
function extractToolActionDescription(toolName, toolArgs) {
|
|
976
999
|
if (!toolName) {
|
|
977
1000
|
return "Using tool...";
|
|
@@ -1347,8 +1370,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1347
1370
|
}, HEARTBEAT_INTERVAL_MS);
|
|
1348
1371
|
}
|
|
1349
1372
|
const sdkPromise = (async () => {
|
|
1373
|
+
const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
|
|
1374
|
+
const prompt = hasImages ? buildPromptStream(
|
|
1375
|
+
scenario.triggerPrompt,
|
|
1376
|
+
scenario.triggerPromptImages
|
|
1377
|
+
) : scenario.triggerPrompt;
|
|
1350
1378
|
for await (const message of query({
|
|
1351
|
-
prompt
|
|
1379
|
+
prompt,
|
|
1352
1380
|
options: queryOptions
|
|
1353
1381
|
})) {
|
|
1354
1382
|
messageCount++;
|
|
@@ -2651,6 +2679,21 @@ function extractToolAction(toolName, args) {
|
|
|
2651
2679
|
}
|
|
2652
2680
|
return `Using ${toolName}...`;
|
|
2653
2681
|
}
|
|
2682
|
+
async function writePromptImages(cwd, images) {
|
|
2683
|
+
const imagesDir = (0, import_path10.join)(cwd, "prompt-images");
|
|
2684
|
+
await (0, import_promises9.mkdir)(imagesDir, { recursive: true });
|
|
2685
|
+
const filePaths = [];
|
|
2686
|
+
for (let i = 0; i < images.length; i++) {
|
|
2687
|
+
const img = images[i];
|
|
2688
|
+
const ext = img.mediaType.split("/")[1] || "png";
|
|
2689
|
+
const filename = `image-${i}.${ext}`;
|
|
2690
|
+
const filepath = (0, import_path10.join)(imagesDir, filename);
|
|
2691
|
+
const buffer = Buffer.from(img.base64, "base64");
|
|
2692
|
+
await (0, import_promises9.writeFile)(filepath, buffer);
|
|
2693
|
+
filePaths.push(filepath);
|
|
2694
|
+
}
|
|
2695
|
+
return filePaths;
|
|
2696
|
+
}
|
|
2654
2697
|
function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
2655
2698
|
const base = {
|
|
2656
2699
|
evalRunId: context.evalRunId,
|
|
@@ -3080,6 +3123,17 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3080
3123
|
"--dir",
|
|
3081
3124
|
options.cwd
|
|
3082
3125
|
];
|
|
3126
|
+
const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
|
|
3127
|
+
const imageFileArgs = [];
|
|
3128
|
+
if (hasImages) {
|
|
3129
|
+
const imagePaths = await writePromptImages(
|
|
3130
|
+
options.cwd,
|
|
3131
|
+
scenario.triggerPromptImages
|
|
3132
|
+
);
|
|
3133
|
+
for (const imgPath of imagePaths) {
|
|
3134
|
+
imageFileArgs.push("-f", imgPath);
|
|
3135
|
+
}
|
|
3136
|
+
}
|
|
3083
3137
|
const accumulatedEvents = [];
|
|
3084
3138
|
let traceStepNumber = 0;
|
|
3085
3139
|
let lastAttemptResult;
|
|
@@ -3114,7 +3168,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3114
3168
|
);
|
|
3115
3169
|
}
|
|
3116
3170
|
}
|
|
3117
|
-
const args = [...baseArgs, prompt];
|
|
3171
|
+
const args = [...baseArgs, ...imageFileArgs, prompt];
|
|
3118
3172
|
console.log(
|
|
3119
3173
|
`[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
|
|
3120
3174
|
args.slice(0, 5)
|
|
@@ -3597,10 +3651,26 @@ async function executeWithAiSdk(context) {
|
|
|
3597
3651
|
}
|
|
3598
3652
|
};
|
|
3599
3653
|
const stepTimestamps = [];
|
|
3600
|
-
const
|
|
3654
|
+
const { triggerPromptImages } = context;
|
|
3655
|
+
const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
|
|
3656
|
+
const generateTextParams = {
|
|
3601
3657
|
model,
|
|
3602
3658
|
system: systemPrompt,
|
|
3603
|
-
|
|
3659
|
+
...hasImages ? {
|
|
3660
|
+
messages: [
|
|
3661
|
+
{
|
|
3662
|
+
role: "user",
|
|
3663
|
+
content: [
|
|
3664
|
+
{ type: "text", text: scenario.triggerPrompt },
|
|
3665
|
+
...triggerPromptImages.map((img) => ({
|
|
3666
|
+
type: "image",
|
|
3667
|
+
image: img.base64,
|
|
3668
|
+
mediaType: img.mediaType
|
|
3669
|
+
}))
|
|
3670
|
+
]
|
|
3671
|
+
}
|
|
3672
|
+
]
|
|
3673
|
+
} : { prompt: scenario.triggerPrompt },
|
|
3604
3674
|
temperature: supportsThinking ? void 0 : modelConfig.temperature,
|
|
3605
3675
|
maxOutputTokens: modelConfig.maxTokens,
|
|
3606
3676
|
tools: mcpTools,
|
|
@@ -3633,7 +3703,8 @@ async function executeWithAiSdk(context) {
|
|
|
3633
3703
|
);
|
|
3634
3704
|
}
|
|
3635
3705
|
}
|
|
3636
|
-
}
|
|
3706
|
+
};
|
|
3707
|
+
const result = await (0, import_ai.generateText)(generateTextParams);
|
|
3637
3708
|
const durationMs = Date.now() - startTime;
|
|
3638
3709
|
const usage = {
|
|
3639
3710
|
inputTokens: result.usage.inputTokens ?? 0,
|
|
@@ -4577,7 +4648,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4577
4648
|
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
4578
4649
|
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
|
|
4579
4650
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
4580
|
-
systemPrompt: agent?.systemPrompt
|
|
4651
|
+
systemPrompt: agent?.systemPrompt,
|
|
4652
|
+
triggerPromptImages: scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0 ? scenario.triggerPromptImages : void 0
|
|
4581
4653
|
};
|
|
4582
4654
|
const hasPrepare = !!adapter.prepareEnvironment;
|
|
4583
4655
|
const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
|