@wix/evalforge-evaluator 0.139.0 → 0.141.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -287,13 +287,16 @@ function resolveSystemAssertion(assertionId, params) {
287
287
  }
288
288
  };
289
289
  break;
290
- case "build_passed":
290
+ case "build_passed": {
291
+ const rawCmd = params?.command;
292
+ const command = typeof rawCmd === "string" && (0, import_evalforge_types.isAllowedBuildCommandString)(rawCmd) ? rawCmd.trim() : void 0;
291
293
  baseAssertion = {
292
294
  type: "build_passed",
293
- command: params?.command ?? void 0,
295
+ ...command !== void 0 && { command },
294
296
  expectedExitCode: params?.expectedExitCode ?? void 0
295
297
  };
296
298
  break;
299
+ }
297
300
  case "time_limit":
298
301
  baseAssertion = {
299
302
  type: "time_limit",
@@ -972,6 +975,26 @@ function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
972
975
 
973
976
  // src/run-scenario/agents/claude-code/execute.ts
974
977
  var DEFAULT_MODEL = import_evalforge_types4.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
978
/**
 * Produces a one-message async stream carrying the trigger prompt together
 * with any attached images, for callers that pass a message stream rather
 * than a plain string prompt.
 *
 * The single yielded value is a `type: "user"` envelope whose message content
 * is one text part followed by one base64 image part per entry in `images`.
 *
 * @param {string} triggerPrompt - Text placed in the leading text part.
 * @param {Array<{base64: string, mediaType: string}>} [images] - Images to
 *   attach after the text; a null or undefined list is treated as empty.
 * @returns {AsyncGenerator<object, void, void>} Stream yielding exactly one
 *   user message.
 */
async function* buildPromptStream(triggerPrompt, images) {
  // A missing image list degrades to a text-only message instead of throwing.
  const imageParts = (images ?? []).map((img) => ({
    type: "image",
    source: {
      type: "base64",
      data: img.base64,
      media_type: img.mediaType
    }
  }));

  yield {
    type: "user",
    message: {
      role: "user",
      content: [{ type: "text", text: triggerPrompt }, ...imageParts]
    },
    parent_tool_use_id: null
  };
}
975
998
  function extractToolActionDescription(toolName, toolArgs) {
976
999
  if (!toolName) {
977
1000
  return "Using tool...";
@@ -1347,8 +1370,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
1347
1370
  }, HEARTBEAT_INTERVAL_MS);
1348
1371
  }
1349
1372
  const sdkPromise = (async () => {
1373
+ const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
1374
+ const prompt = hasImages ? buildPromptStream(
1375
+ scenario.triggerPrompt,
1376
+ scenario.triggerPromptImages
1377
+ ) : scenario.triggerPrompt;
1350
1378
  for await (const message of query({
1351
- prompt: scenario.triggerPrompt,
1379
+ prompt,
1352
1380
  options: queryOptions
1353
1381
  })) {
1354
1382
  messageCount++;
@@ -2651,6 +2679,21 @@ function extractToolAction(toolName, args) {
2651
2679
  }
2652
2680
  return `Using ${toolName}...`;
2653
2681
  }
2682
/**
 * Decodes base64 prompt images and writes them to `<cwd>/prompt-images/`
 * as `image-<index>.<ext>`, creating the directory if needed.
 *
 * The extension is taken from the media-type subtype, reduced to its leading
 * alphanumeric token so that structured-syntax suffixes and parameters
 * (`image/svg+xml`, `image/png; charset=binary`) cannot leak into the
 * filename. Anything unusable falls back to `png`.
 *
 * @param {string} cwd - Directory under which `prompt-images` is created.
 * @param {Array<{base64: string, mediaType: string}>} images - Images to
 *   write; a null or undefined list is treated as empty.
 * @returns {Promise<string[]>} Paths of the written files, in input order.
 */
async function writePromptImages(cwd, images) {
  // Maps a media type to a safe file extension, defaulting to "png".
  const extensionFor = (mediaType) => {
    if (typeof mediaType !== "string") {
      return "png";
    }
    const subtype = mediaType.split(";")[0].split("/")[1] ?? "";
    const token = /^[a-z0-9]+/i.exec(subtype.trim());
    return token ? token[0].toLowerCase() : "png";
  };

  const imagesDir = import_path10.join(cwd, "prompt-images");
  await import_promises9.mkdir(imagesDir, { recursive: true });

  const filePaths = [];
  for (const [index, img] of (images ?? []).entries()) {
    const filepath = import_path10.join(
      imagesDir,
      `image-${index}.${extensionFor(img.mediaType)}`
    );
    await import_promises9.writeFile(filepath, Buffer.from(img.base64, "base64"));
    filePaths.push(filepath);
  }
  return filePaths;
}
2654
2697
  function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
2655
2698
  const base = {
2656
2699
  evalRunId: context.evalRunId,
@@ -3080,6 +3123,17 @@ async function executeWithOpenCode(skills, scenario, options) {
3080
3123
  "--dir",
3081
3124
  options.cwd
3082
3125
  ];
3126
+ const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
3127
+ const imageFileArgs = [];
3128
+ if (hasImages) {
3129
+ const imagePaths = await writePromptImages(
3130
+ options.cwd,
3131
+ scenario.triggerPromptImages
3132
+ );
3133
+ for (const imgPath of imagePaths) {
3134
+ imageFileArgs.push("-f", imgPath);
3135
+ }
3136
+ }
3083
3137
  const accumulatedEvents = [];
3084
3138
  let traceStepNumber = 0;
3085
3139
  let lastAttemptResult;
@@ -3114,7 +3168,7 @@ async function executeWithOpenCode(skills, scenario, options) {
3114
3168
  );
3115
3169
  }
3116
3170
  }
3117
- const args = [...baseArgs, prompt];
3171
+ const args = [...baseArgs, ...imageFileArgs, prompt];
3118
3172
  console.log(
3119
3173
  `[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
3120
3174
  args.slice(0, 5)
@@ -3597,10 +3651,26 @@ async function executeWithAiSdk(context) {
3597
3651
  }
3598
3652
  };
3599
3653
  const stepTimestamps = [];
3600
- const result = await (0, import_ai.generateText)({
3654
+ const { triggerPromptImages } = context;
3655
+ const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
3656
+ const generateTextParams = {
3601
3657
  model,
3602
3658
  system: systemPrompt,
3603
- prompt: scenario.triggerPrompt,
3659
+ ...hasImages ? {
3660
+ messages: [
3661
+ {
3662
+ role: "user",
3663
+ content: [
3664
+ { type: "text", text: scenario.triggerPrompt },
3665
+ ...triggerPromptImages.map((img) => ({
3666
+ type: "image",
3667
+ image: img.base64,
3668
+ mediaType: img.mediaType
3669
+ }))
3670
+ ]
3671
+ }
3672
+ ]
3673
+ } : { prompt: scenario.triggerPrompt },
3604
3674
  temperature: supportsThinking ? void 0 : modelConfig.temperature,
3605
3675
  maxOutputTokens: modelConfig.maxTokens,
3606
3676
  tools: mcpTools,
@@ -3633,7 +3703,8 @@ async function executeWithAiSdk(context) {
3633
3703
  );
3634
3704
  }
3635
3705
  }
3636
- });
3706
+ };
3707
+ const result = await (0, import_ai.generateText)(generateTextParams);
3637
3708
  const durationMs = Date.now() - startTime;
3638
3709
  const usage = {
3639
3710
  inputTokens: result.usage.inputTokens ?? 0,
@@ -4577,7 +4648,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4577
4648
  mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
4578
4649
  subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
4579
4650
  rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
4580
- systemPrompt: agent?.systemPrompt
4651
+ systemPrompt: agent?.systemPrompt,
4652
+ triggerPromptImages: scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0 ? scenario.triggerPromptImages : void 0
4581
4653
  };
4582
4654
  const hasPrepare = !!adapter.prepareEnvironment;
4583
4655
  const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};