@wix/evalforge-evaluator 0.139.0 → 0.141.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -183,7 +183,8 @@ function createApiClient(serverUrl, options = "") {
183
183
  // src/fetch-evaluation-data.ts
184
184
  import {
185
185
  isSystemAssertionId,
186
- SYSTEM_ASSERTIONS
186
+ SYSTEM_ASSERTIONS,
187
+ isAllowedBuildCommandString
187
188
  } from "@wix/evalforge-types";
188
189
 
189
190
  // src/resolve-placeholders.ts
@@ -267,13 +268,16 @@ function resolveSystemAssertion(assertionId, params) {
267
268
  }
268
269
  };
269
270
  break;
270
- case "build_passed":
271
+ case "build_passed": {
272
+ const rawCmd = params?.command;
273
+ const command = typeof rawCmd === "string" && isAllowedBuildCommandString(rawCmd) ? rawCmd.trim() : void 0;
271
274
  baseAssertion = {
272
275
  type: "build_passed",
273
- command: params?.command ?? void 0,
276
+ ...command !== void 0 && { command },
274
277
  expectedExitCode: params?.expectedExitCode ?? void 0
275
278
  };
276
279
  break;
280
+ }
277
281
  case "time_limit":
278
282
  baseAssertion = {
279
283
  type: "time_limit",
@@ -964,6 +968,26 @@ function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
964
968
 
965
969
  // src/run-scenario/agents/claude-code/execute.ts
966
970
  var DEFAULT_MODEL = ClaudeModel.CLAUDE_4_5_SONNET_1_0;
971
+ async function* buildPromptStream(triggerPrompt, images) {
972
+ yield {
973
+ type: "user",
974
+ message: {
975
+ role: "user",
976
+ content: [
977
+ { type: "text", text: triggerPrompt },
978
+ ...images.map((img) => ({
979
+ type: "image",
980
+ source: {
981
+ type: "base64",
982
+ data: img.base64,
983
+ media_type: img.mediaType
984
+ }
985
+ }))
986
+ ]
987
+ },
988
+ parent_tool_use_id: null
989
+ };
990
+ }
967
991
  function extractToolActionDescription(toolName, toolArgs) {
968
992
  if (!toolName) {
969
993
  return "Using tool...";
@@ -1339,8 +1363,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
1339
1363
  }, HEARTBEAT_INTERVAL_MS);
1340
1364
  }
1341
1365
  const sdkPromise = (async () => {
1366
+ const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
1367
+ const prompt = hasImages ? buildPromptStream(
1368
+ scenario.triggerPrompt,
1369
+ scenario.triggerPromptImages
1370
+ ) : scenario.triggerPrompt;
1342
1371
  for await (const message of query({
1343
- prompt: scenario.triggerPrompt,
1372
+ prompt,
1344
1373
  options: queryOptions
1345
1374
  })) {
1346
1375
  messageCount++;
@@ -2651,6 +2680,21 @@ function extractToolAction(toolName, args) {
2651
2680
  }
2652
2681
  return `Using ${toolName}...`;
2653
2682
  }
2683
+ async function writePromptImages(cwd, images) {
2684
+ const imagesDir = join8(cwd, "prompt-images");
2685
+ await mkdir7(imagesDir, { recursive: true });
2686
+ const filePaths = [];
2687
+ for (let i = 0; i < images.length; i++) {
2688
+ const img = images[i];
2689
+ const ext = img.mediaType.split("/")[1] || "png";
2690
+ const filename = `image-${i}.${ext}`;
2691
+ const filepath = join8(imagesDir, filename);
2692
+ const buffer = Buffer.from(img.base64, "base64");
2693
+ await writeFile6(filepath, buffer);
2694
+ filePaths.push(filepath);
2695
+ }
2696
+ return filePaths;
2697
+ }
2654
2698
  function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
2655
2699
  const base = {
2656
2700
  evalRunId: context.evalRunId,
@@ -3080,6 +3124,17 @@ async function executeWithOpenCode(skills, scenario, options) {
3080
3124
  "--dir",
3081
3125
  options.cwd
3082
3126
  ];
3127
+ const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
3128
+ const imageFileArgs = [];
3129
+ if (hasImages) {
3130
+ const imagePaths = await writePromptImages(
3131
+ options.cwd,
3132
+ scenario.triggerPromptImages
3133
+ );
3134
+ for (const imgPath of imagePaths) {
3135
+ imageFileArgs.push("-f", imgPath);
3136
+ }
3137
+ }
3083
3138
  const accumulatedEvents = [];
3084
3139
  let traceStepNumber = 0;
3085
3140
  let lastAttemptResult;
@@ -3114,7 +3169,7 @@ async function executeWithOpenCode(skills, scenario, options) {
3114
3169
  );
3115
3170
  }
3116
3171
  }
3117
- const args = [...baseArgs, prompt];
3172
+ const args = [...baseArgs, ...imageFileArgs, prompt];
3118
3173
  console.log(
3119
3174
  `[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
3120
3175
  args.slice(0, 5)
@@ -3605,10 +3660,26 @@ async function executeWithAiSdk(context) {
3605
3660
  }
3606
3661
  };
3607
3662
  const stepTimestamps = [];
3608
- const result = await generateText({
3663
+ const { triggerPromptImages } = context;
3664
+ const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
3665
+ const generateTextParams = {
3609
3666
  model,
3610
3667
  system: systemPrompt,
3611
- prompt: scenario.triggerPrompt,
3668
+ ...hasImages ? {
3669
+ messages: [
3670
+ {
3671
+ role: "user",
3672
+ content: [
3673
+ { type: "text", text: scenario.triggerPrompt },
3674
+ ...triggerPromptImages.map((img) => ({
3675
+ type: "image",
3676
+ image: img.base64,
3677
+ mediaType: img.mediaType
3678
+ }))
3679
+ ]
3680
+ }
3681
+ ]
3682
+ } : { prompt: scenario.triggerPrompt },
3612
3683
  temperature: supportsThinking ? void 0 : modelConfig.temperature,
3613
3684
  maxOutputTokens: modelConfig.maxTokens,
3614
3685
  tools: mcpTools,
@@ -3641,7 +3712,8 @@ async function executeWithAiSdk(context) {
3641
3712
  );
3642
3713
  }
3643
3714
  }
3644
- });
3715
+ };
3716
+ const result = await generateText(generateTextParams);
3645
3717
  const durationMs = Date.now() - startTime;
3646
3718
  const usage = {
3647
3719
  inputTokens: result.usage.inputTokens ?? 0,
@@ -4585,7 +4657,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
4585
4657
  mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
4586
4658
  subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
4587
4659
  rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
4588
- systemPrompt: agent?.systemPrompt
4660
+ systemPrompt: agent?.systemPrompt,
4661
+ triggerPromptImages: scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0 ? scenario.triggerPromptImages : void 0
4589
4662
  };
4590
4663
  const hasPrepare = !!adapter.prepareEnvironment;
4591
4664
  const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};