npm - @wix/evalforge-evaluator - Versions diffs - 0.132.0 → 0.134.0 - Mend

@wix/evalforge-evaluator 0.132.0 → 0.134.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/build/index.js +102 -62
package/build/index.js.map +4 -4
package/build/index.mjs +102 -62
package/build/index.mjs.map +4 -4
package/build/types/evaluation-loop.d.ts +23 -0
package/package.json +5 -5

package/build/index.mjs CHANGED Viewed

@@ -3474,9 +3474,10 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
   });
   for (let i = 0; i < steps.length; i++) {
     const step = steps[i];
-    const stepTimestamp = new Date(
-      stepTimestamps[i] ?? executionStartMs
-    ).toISOString();
+    const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
+    const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
+    const assistantTimestamp = new Date(stepStartedAt).toISOString();
+    const toolResultTimestamp = new Date(stepFinishedAt).toISOString();
     const assistantContent = [];
     if (step.reasoningText) {
       assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3496,7 +3497,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
       messages.push({
         role: "assistant",
         content: assistantContent,
-        timestamp: stepTimestamp
+        timestamp: assistantTimestamp
       });
     }
     if (step.toolResults.length > 0) {
@@ -3513,7 +3514,7 @@ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestam
       messages.push({
         role: "user",
         content: resultBlocks,
-        timestamp: stepTimestamp
+        timestamp: toolResultTimestamp
       });
     }
   }
@@ -4686,6 +4687,57 @@ async function runScenario(config, evalRunId2, scenario, evalData, template, res
   };
 }
+// src/evaluation-loop.ts
+import { randomUUID as randomUUID5 } from "crypto";
+async function runEvaluationLoop(scenarioItems, evalData, callbacks) { // Runs every scenario item `runsPerScenario` times, one after another, and reports each outcome through `callbacks.addResult`.
+  const runsPerScenario = evalData.evalRun.runsPerScenario ?? 1; // Only null/undefined fall back to 1. NOTE(review): 0 or a negative value is not guarded and would make the inner loop run zero times — confirm upstream validation.
+  let completedExecutions = 0;
+  const totalExecutions = scenarioItems.length * runsPerScenario; // One execution per (scenario, iteration) pair.
+  const targetId = evalData.evalRun.presetId ?? evalData.agent?.id ?? evalData.evalRun.id; // Fallback order: preset id, then agent id, then the eval run id. Used only for synthetic error results below.
+  const targetName = evalData.presetName ?? evalData.agent?.name ?? ""; // Fallback order: preset name, then agent name, then empty string.
+  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
+    for (let iterationIndex = 0; iterationIndex < runsPerScenario; iterationIndex++) {
+      const iterLabel = runsPerScenario > 1 ? ` [run ${iterationIndex + 1}/${runsPerScenario}]` : ""; // Label is empty when each scenario runs only once.
+      callbacks.onProgress(completedExecutions + 1, totalExecutions, iterLabel); // Reports the 1-based position of the execution that is about to start.
+      let scenarioResult = null; // Stays null when runScenario throws, so the success branch below is skipped.
+      try {
+        scenarioResult = await callbacks.runScenario(
+          scenario,
+          template,
+          resolvedAssertions
+        );
+      } catch (err) { // A failing iteration does not abort the loop; it is converted into a synthetic error result instead.
+        const errorMsg = err instanceof Error ? err.message : String(err);
+        console.error(
+          `[Evaluator] Scenario iteration failed, recording as error result: "${scenario.name}"${iterLabel} \u2014 ${errorMsg}`
+        );
+        const errorResult = {
+          id: randomUUID5(),
+          targetId,
+          targetName,
+          scenarioId: scenario.id,
+          scenarioName: scenario.name,
+          assertionResults: [], // No assertion results are recorded for a failed iteration, hence the zero counts below.
+          passed: 0,
+          failed: 0,
+          passRate: 0,
+          duration: 0, // NOTE(review): elapsed time of the failed attempt is not measured; always reported as 0.
+          outputText: `Execution error: ${errorMsg}`,
+          startedAt: (/* @__PURE__ */ new Date()).toISOString(), // Taken at failure-handling time, not when the scenario actually started.
+          completedAt: (/* @__PURE__ */ new Date()).toISOString(), // Separate Date instance from startedAt, created immediately after it.
+          iterationIndex
+        };
+        await callbacks.addResult(errorResult); // Not wrapped in its own try: a rejection here propagates to the caller of runEvaluationLoop.
+      }
+      if (scenarioResult !== null) {
+        await callbacks.addResult({ ...scenarioResult, iterationIndex }); // iterationIndex is listed after the spread, so it overrides any iterationIndex already on the result.
+      }
+      completedExecutions++; // Incremented for both successful and failed iterations.
+    }
+  }
+  return { completedExecutions, totalExecutions }; // Counters for the caller; both are plain numbers computed above.
+}
 // src/error-reporter.ts
 import { EvalStatus } from "@wix/evalforge-types";
 function formatError(error, phase, context) {
@@ -4866,69 +4918,57 @@ async function runEvaluation(projectId2, evalRunId2) {
       `[${ExecutionPhase.VALIDATION}] Eval run has no agent: set agentId for evaluation runs. (agentId: ${evalData.evalRun.agentId || "not set"})`
     );
   }
-  let completedScenarios = 0;
-  const totalScenarios = scenarioItems.length;
-  for (const { scenario, template, resolvedAssertions } of scenarioItems) {
-    state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
-    state.currentContext = {
-      projectId: projectId2,
-      evalRunId: evalRunId2,
-      scenarioId: scenario.id,
-      scenarioName: scenario.name,
-      presetId: evalData.evalRun.presetId,
-      presetName: evalData.presetName,
-      agentId: agent?.id,
-      agentName: agent?.name,
-      progress: `${completedScenarios + 1}/${totalScenarios}`
-    };
-    const skillNames = evalData.skills.map((s) => s.name).join(", ");
-    console.log(
-      "[Evaluator] Running scenario with preset:",
-      evalData.presetName,
-      skillNames ? `(${skillNames})` : "",
-      agent ? `with agent: ${agent.name}` : "",
-      `(${completedScenarios + 1}/${totalScenarios})`
-    );
-    try {
-      const result = await runScenario(
-        config,
-        evalRunId2,
-        scenario,
-        evalData,
-        template,
-        resolvedAssertions
-      );
-      console.log("[Evaluator] Scenario completed, adding result");
-      state.currentPhase = ExecutionPhase.ADD_RESULT;
-      state.currentContext = {
-        ...state.currentContext,
-        resultId: result.id
-      };
-      await api.addResult(projectId2, evalRunId2, result);
-      completedScenarios++;
-    } catch (err) {
-      const errorMsg = err instanceof Error ? err.message : String(err);
-      const errorStack = err instanceof Error ? err.stack : void 0;
-      console.error(
-        "[Evaluator] Failed to run scenario with preset:",
-        evalData.presetName,
-        "Error:",
-        errorMsg
-      );
-      if (errorStack) {
-        console.error("[Evaluator] Stack trace:", errorStack);
+  const skillNames = evalData.skills.map((s) => s.name).join(", ");
+  const { completedExecutions, totalExecutions } = await runEvaluationLoop(
+    scenarioItems,
+    evalData,
+    {
+      runScenario: (scenario, template, resolvedAssertions) => {
+        state.currentPhase = ExecutionPhase.EXECUTE_SKILL;
+        state.currentContext = {
+          projectId: projectId2,
+          evalRunId: evalRunId2,
+          scenarioId: scenario.id,
+          scenarioName: scenario.name,
+          presetId: evalData.evalRun.presetId,
+          presetName: evalData.presetName,
+          agentId: agent?.id,
+          agentName: agent?.name
+        };
+        return runScenario(
+          config,
+          evalRunId2,
+          scenario,
+          evalData,
+          template,
+          resolvedAssertions
+        );
+      },
+      addResult: async (result) => {
+        state.currentPhase = ExecutionPhase.ADD_RESULT;
+        state.currentContext = {
+          ...state.currentContext,
+          resultId: result.id
+        };
+        await api.addResult(projectId2, evalRunId2, result);
+      },
+      onProgress: (completed, total, iterLabel) => {
+        console.log(
+          "[Evaluator] Running scenario with preset:",
+          evalData.presetName,
+          skillNames ? `(${skillNames})` : "",
+          agent ? `with agent: ${agent.name}` : "",
+          `(${completed}/${total})${iterLabel}`
+        );
       }
-      throw new Error(
-        `[${state.currentPhase}] Failed to execute preset "${evalData.presetName}" on scenario "${scenario.name}": ${errorMsg}`
-      );
     }
-  }
+  );
   state.currentPhase = ExecutionPhase.UPDATE_STATUS;
   state.currentContext = {
     projectId: projectId2,
     evalRunId: evalRunId2,
-    completedScenarios,
-    totalScenarios
+    completedExecutions,
+    totalExecutions
   };
   try {
     await api.updateEvalRun(projectId2, evalRunId2, {