@wix/evalforge-evaluator 0.139.0 → 0.141.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs
CHANGED
|
@@ -183,7 +183,8 @@ function createApiClient(serverUrl, options = "") {
|
|
|
183
183
|
// src/fetch-evaluation-data.ts
|
|
184
184
|
import {
|
|
185
185
|
isSystemAssertionId,
|
|
186
|
-
SYSTEM_ASSERTIONS
|
|
186
|
+
SYSTEM_ASSERTIONS,
|
|
187
|
+
isAllowedBuildCommandString
|
|
187
188
|
} from "@wix/evalforge-types";
|
|
188
189
|
|
|
189
190
|
// src/resolve-placeholders.ts
|
|
@@ -267,13 +268,16 @@ function resolveSystemAssertion(assertionId, params) {
|
|
|
267
268
|
}
|
|
268
269
|
};
|
|
269
270
|
break;
|
|
270
|
-
case "build_passed":
|
|
271
|
+
case "build_passed": {
|
|
272
|
+
const rawCmd = params?.command;
|
|
273
|
+
const command = typeof rawCmd === "string" && isAllowedBuildCommandString(rawCmd) ? rawCmd.trim() : void 0;
|
|
271
274
|
baseAssertion = {
|
|
272
275
|
type: "build_passed",
|
|
273
|
-
command
|
|
276
|
+
...command !== void 0 && { command },
|
|
274
277
|
expectedExitCode: params?.expectedExitCode ?? void 0
|
|
275
278
|
};
|
|
276
279
|
break;
|
|
280
|
+
}
|
|
277
281
|
case "time_limit":
|
|
278
282
|
baseAssertion = {
|
|
279
283
|
type: "time_limit",
|
|
@@ -964,6 +968,26 @@ function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
|
964
968
|
|
|
965
969
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
966
970
|
var DEFAULT_MODEL = ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
971
|
+
/**
 * Produces the prompt as an async stream containing a single user message.
 *
 * The message content is the text prompt followed by one base64 image block
 * per entry in `images`, in the same order.
 *
 * @param {string} triggerPrompt - Text placed in the leading `text` content block.
 * @param {Array<{ base64: string, mediaType: string }>} [images] - Images to attach;
 *   each entry's `base64` and `mediaType` are copied into an `image` block.
 *   A missing list is treated as "no images".
 * @returns {AsyncGenerator<object>} Yields exactly one `type: "user"` message.
 */
async function* buildPromptStream(triggerPrompt, images) {
  // Tolerate null/undefined so a text-only call yields a valid message
  // instead of throwing on `.map`.
  const imageBlocks = (images ?? []).map((img) => ({
    type: "image",
    source: {
      type: "base64",
      data: img.base64,
      media_type: img.mediaType
    }
  }));
  yield {
    type: "user",
    message: {
      role: "user",
      content: [{ type: "text", text: triggerPrompt }, ...imageBlocks]
    },
    parent_tool_use_id: null
  };
}
|
|
967
991
|
function extractToolActionDescription(toolName, toolArgs) {
|
|
968
992
|
if (!toolName) {
|
|
969
993
|
return "Using tool...";
|
|
@@ -1339,8 +1363,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1339
1363
|
}, HEARTBEAT_INTERVAL_MS);
|
|
1340
1364
|
}
|
|
1341
1365
|
const sdkPromise = (async () => {
|
|
1366
|
+
const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
|
|
1367
|
+
const prompt = hasImages ? buildPromptStream(
|
|
1368
|
+
scenario.triggerPrompt,
|
|
1369
|
+
scenario.triggerPromptImages
|
|
1370
|
+
) : scenario.triggerPrompt;
|
|
1342
1371
|
for await (const message of query({
|
|
1343
|
-
prompt
|
|
1372
|
+
prompt,
|
|
1344
1373
|
options: queryOptions
|
|
1345
1374
|
})) {
|
|
1346
1375
|
messageCount++;
|
|
@@ -2651,6 +2680,21 @@ function extractToolAction(toolName, args) {
|
|
|
2651
2680
|
}
|
|
2652
2681
|
return `Using ${toolName}...`;
|
|
2653
2682
|
}
|
|
2683
|
+
/**
 * Decodes base64 prompt images and writes them under `<cwd>/prompt-images`.
 *
 * Files are named `image-<index>.<ext>`, where `<ext>` comes from the subtype
 * of the image's media type. The directory is created if it does not exist.
 *
 * @param {string} cwd - Directory in which the `prompt-images` folder is created.
 * @param {Array<{ base64: string, mediaType: string }>} images - Images to write;
 *   a missing list is treated as empty.
 * @returns {Promise<string[]>} Paths of the written files, in input order.
 */
async function writePromptImages(cwd, images) {
  const imagesDir = join8(cwd, "prompt-images");
  await mkdir7(imagesDir, { recursive: true });

  // Derive a filesystem-friendly extension from the media type. Only the
  // leading alphanumeric/dot/hyphen run of the subtype is kept, which drops
  // structured-syntax suffixes ("svg+xml" -> "svg") and parameters
  // ("jpeg; charset=binary" -> "jpeg"). Anything unusable falls back to "png".
  const extensionFor = (mediaType) => {
    const subtype = typeof mediaType === "string" ? mediaType.split("/")[1] ?? "" : "";
    const match = /^[a-z0-9][a-z0-9.-]*/i.exec(subtype.trim());
    return match ? match[0] : "png";
  };

  // The writes are independent, so run them in parallel; Promise.all keeps
  // the resulting paths in the same order as the input images.
  return Promise.all(
    (images ?? []).map(async (img, index) => {
      const filepath = join8(imagesDir, `image-${index}.${extensionFor(img.mediaType)}`);
      await writeFile6(filepath, Buffer.from(img.base64, "base64"));
      return filepath;
    })
  );
}
|
|
2654
2698
|
function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
|
|
2655
2699
|
const base = {
|
|
2656
2700
|
evalRunId: context.evalRunId,
|
|
@@ -3080,6 +3124,17 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3080
3124
|
"--dir",
|
|
3081
3125
|
options.cwd
|
|
3082
3126
|
];
|
|
3127
|
+
const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
|
|
3128
|
+
const imageFileArgs = [];
|
|
3129
|
+
if (hasImages) {
|
|
3130
|
+
const imagePaths = await writePromptImages(
|
|
3131
|
+
options.cwd,
|
|
3132
|
+
scenario.triggerPromptImages
|
|
3133
|
+
);
|
|
3134
|
+
for (const imgPath of imagePaths) {
|
|
3135
|
+
imageFileArgs.push("-f", imgPath);
|
|
3136
|
+
}
|
|
3137
|
+
}
|
|
3083
3138
|
const accumulatedEvents = [];
|
|
3084
3139
|
let traceStepNumber = 0;
|
|
3085
3140
|
let lastAttemptResult;
|
|
@@ -3114,7 +3169,7 @@ async function executeWithOpenCode(skills, scenario, options) {
|
|
|
3114
3169
|
);
|
|
3115
3170
|
}
|
|
3116
3171
|
}
|
|
3117
|
-
const args = [...baseArgs, prompt];
|
|
3172
|
+
const args = [...baseArgs, ...imageFileArgs, prompt];
|
|
3118
3173
|
console.log(
|
|
3119
3174
|
`[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
|
|
3120
3175
|
args.slice(0, 5)
|
|
@@ -3605,10 +3660,26 @@ async function executeWithAiSdk(context) {
|
|
|
3605
3660
|
}
|
|
3606
3661
|
};
|
|
3607
3662
|
const stepTimestamps = [];
|
|
3608
|
-
const
|
|
3663
|
+
const { triggerPromptImages } = context;
|
|
3664
|
+
const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
|
|
3665
|
+
const generateTextParams = {
|
|
3609
3666
|
model,
|
|
3610
3667
|
system: systemPrompt,
|
|
3611
|
-
|
|
3668
|
+
...hasImages ? {
|
|
3669
|
+
messages: [
|
|
3670
|
+
{
|
|
3671
|
+
role: "user",
|
|
3672
|
+
content: [
|
|
3673
|
+
{ type: "text", text: scenario.triggerPrompt },
|
|
3674
|
+
...triggerPromptImages.map((img) => ({
|
|
3675
|
+
type: "image",
|
|
3676
|
+
image: img.base64,
|
|
3677
|
+
mediaType: img.mediaType
|
|
3678
|
+
}))
|
|
3679
|
+
]
|
|
3680
|
+
}
|
|
3681
|
+
]
|
|
3682
|
+
} : { prompt: scenario.triggerPrompt },
|
|
3612
3683
|
temperature: supportsThinking ? void 0 : modelConfig.temperature,
|
|
3613
3684
|
maxOutputTokens: modelConfig.maxTokens,
|
|
3614
3685
|
tools: mcpTools,
|
|
@@ -3641,7 +3712,8 @@ async function executeWithAiSdk(context) {
|
|
|
3641
3712
|
);
|
|
3642
3713
|
}
|
|
3643
3714
|
}
|
|
3644
|
-
}
|
|
3715
|
+
};
|
|
3716
|
+
const result = await generateText(generateTextParams);
|
|
3645
3717
|
const durationMs = Date.now() - startTime;
|
|
3646
3718
|
const usage = {
|
|
3647
3719
|
inputTokens: result.usage.inputTokens ?? 0,
|
|
@@ -4585,7 +4657,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
4585
4657
|
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
4586
4658
|
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
|
|
4587
4659
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
4588
|
-
systemPrompt: agent?.systemPrompt
|
|
4660
|
+
systemPrompt: agent?.systemPrompt,
|
|
4661
|
+
triggerPromptImages: scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0 ? scenario.triggerPromptImages : void 0
|
|
4589
4662
|
};
|
|
4590
4663
|
const hasPrepare = !!adapter.prepareEnvironment;
|
|
4591
4664
|
const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};
|