npm - @wix/evalforge-evaluator - Versions diffs - 0.119.0 → 0.120.0 - Mend

@wix/evalforge-evaluator 0.119.0 → 0.120.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (7)

package/build/index.js +45 -54
package/build/index.js.map +2 -2
package/build/index.mjs +45 -54
package/build/index.mjs.map +3 -3
package/build/types/run-scenario/agents/simple-agent/build-conversation.d.ts +1 -1
package/build/types/run-scenario/agents/simple-agent/execute.d.ts +1 -1
package/package.json +2 -2

package/build/index.js CHANGED Viewed

@@ -55,6 +55,7 @@ function loadConfig() {
       aiGatewayHeaders[key] = value;
     }
   }
+  aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
   const tracePushUrl = process.env.TRACE_PUSH_URL;
   const routeHeader = process.env.EVAL_ROUTE_HEADER;
   const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -3404,7 +3405,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
 }
 // src/run-scenario/agents/simple-agent/build-conversation.ts
-function buildConversation3(triggerPrompt, steps, executionStartMs) {
+function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestamps) {
   const messages = [];
   messages.push({
     role: "user",
@@ -3413,11 +3414,9 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
   });
   for (let i = 0; i < steps.length; i++) {
     const step = steps[i];
-    const stepTimestamp = estimateStepTimestamp(
-      executionStartMs,
-      i,
-      steps.length
-    );
+    const stepTimestamp = new Date(
+      stepTimestamps[i] ?? executionStartMs
+    ).toISOString();
     const assistantContent = [];
     if (step.reasoningText) {
       assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3460,10 +3459,6 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
   }
   return messages;
 }
-function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
-  const offset = totalSteps > 1 ? (stepIndex + 1) / totalSteps : 1;
-  return new Date(startMs + Math.round(offset * 1e3)).toISOString();
-}
 // src/run-scenario/agents/simple-agent/execute.ts
 var PROVIDER_ANTHROPIC2 = "anthropic";
@@ -3548,6 +3543,7 @@ async function executeWithAiSdk(context) {
         }
       }
     };
+    const stepTimestamps = [];
     const result = await (0, import_ai.generateText)({
       model,
       system: systemPrompt,
@@ -3556,7 +3552,34 @@ async function executeWithAiSdk(context) {
       maxOutputTokens: modelConfig.maxTokens,
       tools: mcpTools,
       stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
-      providerOptions: providerOpts
+      providerOptions: providerOpts,
+      onStepFinish: (step) => {
+        stepTimestamps.push(Date.now());
+        if (traceContext) {
+          const isToolStep = step.toolCalls.length > 0;
+          const firstToolCall = step.toolCalls[0];
+          emitTraceEvent(
+            {
+              evalRunId: traceContext.evalRunId,
+              scenarioId: traceContext.scenarioId,
+              scenarioName: traceContext.scenarioName,
+              targetId: traceContext.targetId,
+              targetName: traceContext.targetName,
+              stepNumber: stepTimestamps.length,
+              type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
+              toolName: firstToolCall?.toolName,
+              toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
+              outputPreview: step.text?.slice(0, 500),
+              elapsedMs: Date.now() - startTime,
+              timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+              isComplete: false
+            },
+            traceContext.tracePushUrl,
+            traceContext.routeHeader,
+            traceContext.authToken
+          );
+        }
+      }
     });
     const durationMs = Date.now() - startTime;
     const usage = {
@@ -3570,16 +3593,17 @@ async function executeWithAiSdk(context) {
       usage,
       modelConfig.model,
       provider,
-      startTime
+      startTime,
+      stepTimestamps
     );
     if (traceContext) {
-      emitStepEvents(traceContext, result.steps, startTime);
-      emitCompletionEvent(traceContext, result.steps.length + 1);
+      emitCompletionEvent(traceContext, stepTimestamps.length + 1);
     }
     const conversation = buildConversation3(
       scenario.triggerPrompt,
       result.steps,
-      startTime
+      startTime,
+      stepTimestamps
     );
     return {
       outputText: result.text,
@@ -3620,20 +3644,16 @@ function findToolResultError(step) {
   }
   return null;
 }
-function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
-  const totalStepTokens = steps.reduce(
-    (sum, s) => sum + (s.usage.totalTokens ?? 0),
-    0
-  );
+function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs, stepTimestamps) {
   const traceSteps = steps.map((step, i) => {
-    const stepTokens = step.usage.totalTokens ?? 0;
-    const proportion = totalStepTokens > 0 ? stepTokens / totalStepTokens : 0;
-    const stepDurationMs = Math.round(totalDurationMs * proportion);
+    const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
+    const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
+    const stepDurationMs = stepFinishedAt - stepStartedAt;
     const firstToolCall = step.toolCalls[0];
     const tokenUsage = {
       prompt: step.usage.inputTokens ?? 0,
       completion: step.usage.outputTokens ?? 0,
-      total: stepTokens
+      total: step.usage.totalTokens ?? 0
     };
     const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
     const toolResultError = findToolResultError(step);
@@ -3644,9 +3664,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
       type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
       model: modelId,
       provider,
-      startedAt: new Date(
-        executionStartMs + Math.round(totalDurationMs * (i / Math.max(steps.length, 1)))
-      ).toISOString(),
+      startedAt: new Date(stepStartedAt).toISOString(),
       durationMs: stepDurationMs,
       tokenUsage,
       costUsd,
@@ -3704,33 +3722,6 @@ function emitStartEvent(traceContext, startTime) {
     traceContext.authToken
   );
 }
-function emitStepEvents(traceContext, steps, startTime) {
-  for (let i = 0; i < steps.length; i++) {
-    const step = steps[i];
-    const isToolStep = step.toolCalls.length > 0;
-    const firstToolCall = step.toolCalls[0];
-    emitTraceEvent(
-      {
-        evalRunId: traceContext.evalRunId,
-        scenarioId: traceContext.scenarioId,
-        scenarioName: traceContext.scenarioName,
-        targetId: traceContext.targetId,
-        targetName: traceContext.targetName,
-        stepNumber: i + 1,
-        type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
-        toolName: firstToolCall?.toolName,
-        toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
-        outputPreview: step.text?.slice(0, 500),
-        elapsedMs: Date.now() - startTime,
-        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
-        isComplete: false
-      },
-      traceContext.tracePushUrl,
-      traceContext.routeHeader,
-      traceContext.authToken
-    );
-  }
-}
 function emitCompletionEvent(traceContext, stepNumber) {
   emitTraceEvent(
     {