npm - @wix/evalforge-evaluator - Versions diffs - 0.119.0 → 0.121.0 - Mend

@wix/evalforge-evaluator 0.119.0 → 0.121.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (9)

package/build/index.js +48 -144
package/build/index.js.map +2 -2
package/build/index.mjs +48 -144
package/build/index.mjs.map +3 -3
package/build/types/api-client.d.ts +1 -2
package/build/types/fetch-evaluation-data.d.ts +1 -5
package/build/types/run-scenario/agents/simple-agent/build-conversation.d.ts +1 -1
package/build/types/run-scenario/agents/simple-agent/execute.d.ts +1 -1
package/package.json +5 -5

package/build/index.js (changed)

@@ -55,6 +55,7 @@ function loadConfig() {
       aiGatewayHeaders[key] = value;
     }
   }
+  aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
   const tracePushUrl = process.env.TRACE_PUSH_URL;
   const routeHeader = process.env.EVAL_ROUTE_HEADER;
   const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -185,9 +186,6 @@ function createApiClient(serverUrl, options = "") {
     getPreset(projectId2, id) {
       return fetchJson(`/projects/${projectId2}/presets/${id}`);
     },
-    getAssertion(projectId2, id) {
-      return fetchJson(`/projects/${projectId2}/assertions/${id}`);
-    },
     addResult(projectId2, evalRunId2, result) {
       return postJson(
         `/projects/${projectId2}/eval-runs/${evalRunId2}/results`,
@@ -249,9 +247,6 @@ function resolveValue(value, placeholders) {
   }
   return value;
 }
-function resolvePlaceholdersInString(text, placeholders) {
-  return resolveValue(text, placeholders);
-}
 // src/fetch-evaluation-data.ts
 function parseSkillNamesFromParams(value) {
@@ -264,59 +259,6 @@ function parseSkillNamesFromParams(value) {
   }
   return [];
 }
-function applyParamsToAssertion(assertion, params) {
-  if (!params || Object.keys(params).length === 0) {
-    return assertion;
-  }
-  if (assertion.type === "llm_judge") {
-    const stringParams = {};
-    for (const [key, value] of Object.entries(params)) {
-      stringParams[key] = String(value ?? "");
-    }
-    const prompt = resolvePlaceholdersInString(assertion.prompt, stringParams);
-    return {
-      ...assertion,
-      prompt,
-      ...params.model !== void 0 && { model: params.model },
-      ...params.maxTokens !== void 0 && {
-        maxTokens: params.maxTokens
-      },
-      ...params.temperature !== void 0 && {
-        temperature: params.temperature
-      },
-      ...params.minScore !== void 0 && {
-        minScore: params.minScore
-      }
-    };
-  }
-  if (assertion.type === "time_limit" && params.maxDurationMs !== void 0) {
-    return {
-      ...assertion,
-      maxDurationMs: params.maxDurationMs
-    };
-  }
-  if (assertion.type === "skill_was_called" && params.skillNames !== void 0) {
-    return {
-      ...assertion,
-      skillNames: parseSkillNamesFromParams(params.skillNames)
-    };
-  }
-  if (assertion.type === "tool_called_with_param") {
-    return {
-      ...assertion,
-      ...params.toolName !== void 0 && {
-        toolName: params.toolName
-      },
-      ...params.expectedParams !== void 0 && {
-        expectedParams: params.expectedParams
-      },
-      ...params.requireSuccess !== void 0 && {
-        requireSuccess: params.requireSuccess
-      }
-    };
-  }
-  return { ...assertion, ...params };
-}
 function resolveSystemAssertion(assertionId, params) {
   const systemAssertion = import_evalforge_types.SYSTEM_ASSERTIONS[assertionId];
   let baseAssertion;
@@ -371,18 +313,6 @@ function resolveSystemAssertion(assertionId, params) {
   }
   return baseAssertion;
 }
-function customAssertionToAssertion(ca, params) {
-  const config = ca.config;
-  const baseAssertion = {
-    type: "llm_judge",
-    prompt: config?.prompt ?? "",
-    minScore: config?.minScore,
-    model: config?.model,
-    maxTokens: config?.maxTokens,
-    temperature: config?.temperature
-  };
-  return applyParamsToAssertion(baseAssertion, params);
-}
 async function fetchEvaluationData(api, projectId2, evalRunId2) {
   const evalRun = await api.getEvalRun(projectId2, evalRunId2);
   const scenarios = await Promise.all(
@@ -452,30 +382,14 @@ async function fetchEvaluationData(api, projectId2, evalRunId2) {
     templateIds.map((id) => api.getTemplate(projectId2, id))
   ) : [];
   const templateMap = new Map(templates.map((t) => [t.id, t]));
-  const assertionIds = [
-    ...new Set(
-      scenarios.flatMap((s) => s.assertionLinks ?? []).map((link) => link.assertionId).filter((id) => !(0, import_evalforge_types.isSystemAssertionId)(id))
-    )
-  ];
-  const assertions = assertionIds.length > 0 ? await Promise.all(
-    assertionIds.map((id) => api.getAssertion(projectId2, id))
-  ) : [];
-  const assertionMap = new Map(assertions.map((a) => [a.id, a]));
   const scenarioItems = scenarios.map((scenario) => {
     const resolvedAssertions = (scenario.assertionLinks ?? []).map((link) => {
       const { assertionId, params } = link;
-      if ((0, import_evalforge_types.isSystemAssertionId)(assertionId)) {
-        return resolveSystemAssertion(
-          assertionId,
-          params
-        );
-      }
-      const customAssertion = assertionMap.get(assertionId);
-      if (!customAssertion) {
+      if (!(0, import_evalforge_types.isSystemAssertionId)(assertionId)) {
         return null;
       }
-      return customAssertionToAssertion(
-        customAssertion,
+      return resolveSystemAssertion(
+        assertionId,
         params
       );
     }).filter((a) => a !== null);
@@ -3404,7 +3318,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
 }
 // src/run-scenario/agents/simple-agent/build-conversation.ts
-function buildConversation3(triggerPrompt, steps, executionStartMs) {
+function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestamps) {
   const messages = [];
   messages.push({
     role: "user",
@@ -3413,11 +3327,9 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
   });
   for (let i = 0; i < steps.length; i++) {
     const step = steps[i];
-    const stepTimestamp = estimateStepTimestamp(
-      executionStartMs,
-      i,
-      steps.length
-    );
+    const stepTimestamp = new Date(
+      stepTimestamps[i] ?? executionStartMs
+    ).toISOString();
     const assistantContent = [];
     if (step.reasoningText) {
       assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3460,10 +3372,6 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
   }
   return messages;
 }
-function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
-  const offset = totalSteps > 1 ? (stepIndex + 1) / totalSteps : 1;
-  return new Date(startMs + Math.round(offset * 1e3)).toISOString();
-}
 // src/run-scenario/agents/simple-agent/execute.ts
 var PROVIDER_ANTHROPIC2 = "anthropic";
@@ -3548,6 +3456,7 @@ async function executeWithAiSdk(context) {
         }
       }
     };
+    const stepTimestamps = [];
     const result = await (0, import_ai.generateText)({
       model,
       system: systemPrompt,
@@ -3556,7 +3465,34 @@ async function executeWithAiSdk(context) {
       maxOutputTokens: modelConfig.maxTokens,
       tools: mcpTools,
       stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
-      providerOptions: providerOpts
+      providerOptions: providerOpts,
+      onStepFinish: (step) => {
+        stepTimestamps.push(Date.now());
+        if (traceContext) {
+          const isToolStep = step.toolCalls.length > 0;
+          const firstToolCall = step.toolCalls[0];
+          emitTraceEvent(
+            {
+              evalRunId: traceContext.evalRunId,
+              scenarioId: traceContext.scenarioId,
+              scenarioName: traceContext.scenarioName,
+              targetId: traceContext.targetId,
+              targetName: traceContext.targetName,
+              stepNumber: stepTimestamps.length,
+              type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
+              toolName: firstToolCall?.toolName,
+              toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
+              outputPreview: step.text?.slice(0, 500),
+              elapsedMs: Date.now() - startTime,
+              timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+              isComplete: false
+            },
+            traceContext.tracePushUrl,
+            traceContext.routeHeader,
+            traceContext.authToken
+          );
+        }
+      }
     });
     const durationMs = Date.now() - startTime;
     const usage = {
@@ -3570,16 +3506,17 @@ async function executeWithAiSdk(context) {
       usage,
       modelConfig.model,
       provider,
-      startTime
+      startTime,
+      stepTimestamps
     );
     if (traceContext) {
-      emitStepEvents(traceContext, result.steps, startTime);
-      emitCompletionEvent(traceContext, result.steps.length + 1);
+      emitCompletionEvent(traceContext, stepTimestamps.length + 1);
     }
     const conversation = buildConversation3(
       scenario.triggerPrompt,
       result.steps,
-      startTime
+      startTime,
+      stepTimestamps
     );
     return {
       outputText: result.text,
@@ -3620,20 +3557,16 @@ function findToolResultError(step) {
   }
   return null;
 }
-function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
-  const totalStepTokens = steps.reduce(
-    (sum, s) => sum + (s.usage.totalTokens ?? 0),
-    0
-  );
+function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs, stepTimestamps) {
   const traceSteps = steps.map((step, i) => {
-    const stepTokens = step.usage.totalTokens ?? 0;
-    const proportion = totalStepTokens > 0 ? stepTokens / totalStepTokens : 0;
-    const stepDurationMs = Math.round(totalDurationMs * proportion);
+    const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
+    const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
+    const stepDurationMs = stepFinishedAt - stepStartedAt;
     const firstToolCall = step.toolCalls[0];
     const tokenUsage = {
       prompt: step.usage.inputTokens ?? 0,
       completion: step.usage.outputTokens ?? 0,
-      total: stepTokens
+      total: step.usage.totalTokens ?? 0
     };
     const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
     const toolResultError = findToolResultError(step);
@@ -3644,9 +3577,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
       type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
       model: modelId,
       provider,
-      startedAt: new Date(
-        executionStartMs + Math.round(totalDurationMs * (i / Math.max(steps.length, 1)))
-      ).toISOString(),
+      startedAt: new Date(stepStartedAt).toISOString(),
       durationMs: stepDurationMs,
       tokenUsage,
       costUsd,
@@ -3704,33 +3635,6 @@ function emitStartEvent(traceContext, startTime) {
     traceContext.authToken
   );
 }
-function emitStepEvents(traceContext, steps, startTime) {
-  for (let i = 0; i < steps.length; i++) {
-    const step = steps[i];
-    const isToolStep = step.toolCalls.length > 0;
-    const firstToolCall = step.toolCalls[0];
-    emitTraceEvent(
-      {
-        evalRunId: traceContext.evalRunId,
-        scenarioId: traceContext.scenarioId,
-        scenarioName: traceContext.scenarioName,
-        targetId: traceContext.targetId,
-        targetName: traceContext.targetName,
-        stepNumber: i + 1,
-        type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
-        toolName: firstToolCall?.toolName,
-        toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
-        outputPreview: step.text?.slice(0, 500),
-        elapsedMs: Date.now() - startTime,
-        timestamp: (/* @__PURE__ */ new Date()).toISOString(),
-        isComplete: false
-      },
-      traceContext.tracePushUrl,
-      traceContext.routeHeader,
-      traceContext.authToken
-    );
-  }
-}
 function emitCompletionEvent(traceContext, stepNumber) {
   emitTraceEvent(
     {