npm - @wix/evalforge-evaluator - Versions diffs - 0.139.0 → 0.141.0 - Mend

@wix/evalforge-evaluator 0.139.0 → 0.141.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

Files changed (7)

package/build/index.js +80 -8
package/build/index.js.map +2 -2
package/build/index.mjs +82 -9
package/build/index.mjs.map +2 -2
package/build/types/run-scenario/agents/claude-code/execute.d.ts +12 -2
package/build/types/run-scenario/agents/opencode/execute.d.ts +13 -1
package/package.json +5 -5

package/build/index.mjs CHANGED

@@ -183,7 +183,8 @@ function createApiClient(serverUrl, options = "") {
 // src/fetch-evaluation-data.ts
 import {
   isSystemAssertionId,
-  SYSTEM_ASSERTIONS
+  SYSTEM_ASSERTIONS,
+  isAllowedBuildCommandString
 } from "@wix/evalforge-types";
 // src/resolve-placeholders.ts
@@ -267,13 +268,16 @@ function resolveSystemAssertion(assertionId, params) {
         }
       };
       break;
-    case "build_passed":
+    case "build_passed": {
+      const rawCmd = params?.command;
+      const command = typeof rawCmd === "string" && isAllowedBuildCommandString(rawCmd) ? rawCmd.trim() : void 0;
       baseAssertion = {
         type: "build_passed",
-        command: params?.command ?? void 0,
+        ...command !== void 0 && { command },
         expectedExitCode: params?.expectedExitCode ?? void 0
       };
       break;
+    }
     case "time_limit":
       baseAssertion = {
         type: "time_limit",
@@ -964,6 +968,26 @@ function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
 // src/run-scenario/agents/claude-code/execute.ts
 var DEFAULT_MODEL = ClaudeModel.CLAUDE_4_5_SONNET_1_0;
+async function* buildPromptStream(triggerPrompt, images) {
+  yield {
+    type: "user",
+    message: {
+      role: "user",
+      content: [
+        { type: "text", text: triggerPrompt },
+        ...images.map((img) => ({
+          type: "image",
+          source: {
+            type: "base64",
+            data: img.base64,
+            media_type: img.mediaType
+          }
+        }))
+      ]
+    },
+    parent_tool_use_id: null
+  };
+}
 function extractToolActionDescription(toolName, toolArgs) {
   if (!toolName) {
     return "Using tool...";
@@ -1339,8 +1363,13 @@ async function executeWithClaudeCode(skills, scenario, options) {
       }, HEARTBEAT_INTERVAL_MS);
     }
     const sdkPromise = (async () => {
+      const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
+      const prompt = hasImages ? buildPromptStream(
+        scenario.triggerPrompt,
+        scenario.triggerPromptImages
+      ) : scenario.triggerPrompt;
       for await (const message of query({
-        prompt: scenario.triggerPrompt,
+        prompt,
         options: queryOptions
       })) {
         messageCount++;
@@ -2651,6 +2680,21 @@ function extractToolAction(toolName, args) {
   }
   return `Using ${toolName}...`;
 }
+async function writePromptImages(cwd, images) {
+  const imagesDir = join8(cwd, "prompt-images");
+  await mkdir7(imagesDir, { recursive: true });
+  const filePaths = [];
+  for (let i = 0; i < images.length; i++) {
+    const img = images[i];
+    const ext = img.mediaType.split("/")[1] || "png";
+    const filename = `image-${i}.${ext}`;
+    const filepath = join8(imagesDir, filename);
+    const buffer = Buffer.from(img.base64, "base64");
+    await writeFile6(filepath, buffer);
+    filePaths.push(filepath);
+  }
+  return filePaths;
+}
 function createTraceEventFromNdjson(evt, context, stepNumber, isComplete) {
   const base = {
     evalRunId: context.evalRunId,
@@ -3080,6 +3124,17 @@ async function executeWithOpenCode(skills, scenario, options) {
     "--dir",
     options.cwd
   ];
+  const hasImages = scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0;
+  const imageFileArgs = [];
+  if (hasImages) {
+    const imagePaths = await writePromptImages(
+      options.cwd,
+      scenario.triggerPromptImages
+    );
+    for (const imgPath of imagePaths) {
+      imageFileArgs.push("-f", imgPath);
+    }
+  }
   const accumulatedEvents = [];
   let traceStepNumber = 0;
   let lastAttemptResult;
@@ -3114,7 +3169,7 @@ async function executeWithOpenCode(skills, scenario, options) {
         );
       }
     }
-    const args = [...baseArgs, prompt];
+    const args = [...baseArgs, ...imageFileArgs, prompt];
     console.log(
       `[executeWithOpenCode] Spawning attempt ${attempt}: opencode`,
       args.slice(0, 5)
@@ -3605,10 +3660,26 @@ async function executeWithAiSdk(context) {
       }
     };
     const stepTimestamps = [];
-    const result = await generateText({
+    const { triggerPromptImages } = context;
+    const hasImages = triggerPromptImages && triggerPromptImages.length > 0;
+    const generateTextParams = {
       model,
       system: systemPrompt,
-      prompt: scenario.triggerPrompt,
+      ...hasImages ? {
+        messages: [
+          {
+            role: "user",
+            content: [
+              { type: "text", text: scenario.triggerPrompt },
+              ...triggerPromptImages.map((img) => ({
+                type: "image",
+                image: img.base64,
+                mediaType: img.mediaType
+              }))
+            ]
+          }
+        ]
+      } : { prompt: scenario.triggerPrompt },
       temperature: supportsThinking ? void 0 : modelConfig.temperature,
       maxOutputTokens: modelConfig.maxTokens,
       tools: mcpTools,
@@ -3641,7 +3712,8 @@ async function executeWithAiSdk(context) {
           );
         }
       }
-    });
+    };
+    const result = await generateText(generateTextParams);
     const durationMs = Date.now() - startTime;
     const usage = {
       inputTokens: result.usage.inputTokens ?? 0,
@@ -4585,7 +4657,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
     mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
     subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
     rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
-    systemPrompt: agent?.systemPrompt
+    systemPrompt: agent?.systemPrompt,
+    triggerPromptImages: scenario.triggerPromptImages && scenario.triggerPromptImages.length > 0 ? scenario.triggerPromptImages : void 0
   };
   const hasPrepare = !!adapter.prepareEnvironment;
   const prePrepSnapshot = hasPrepare && workDir ? snapshotDirectory(workDir) : {};