@wix/evalforge-evaluator 0.119.0 → 0.120.0

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -55,6 +55,7 @@ function loadConfig() {
55
55
  aiGatewayHeaders[key] = value;
56
56
  }
57
57
  }
58
+ aiGatewayHeaders["x-wix-ai-gateway-disable-cache"] = "true";
58
59
  const tracePushUrl = process.env.TRACE_PUSH_URL;
59
60
  const routeHeader = process.env.EVAL_ROUTE_HEADER;
60
61
  const authToken = process.env.EVAL_AUTH_TOKEN;
@@ -3404,7 +3405,7 @@ function calculateStepCost(step, modelId, provider, tokenUsage) {
3404
3405
  }
3405
3406
 
3406
3407
  // src/run-scenario/agents/simple-agent/build-conversation.ts
3407
- function buildConversation3(triggerPrompt, steps, executionStartMs) {
3408
+ function buildConversation3(triggerPrompt, steps, executionStartMs, stepTimestamps) {
3408
3409
  const messages = [];
3409
3410
  messages.push({
3410
3411
  role: "user",
@@ -3413,11 +3414,9 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3413
3414
  });
3414
3415
  for (let i = 0; i < steps.length; i++) {
3415
3416
  const step = steps[i];
3416
- const stepTimestamp = estimateStepTimestamp(
3417
- executionStartMs,
3418
- i,
3419
- steps.length
3420
- );
3417
+ const stepTimestamp = new Date(
3418
+ stepTimestamps[i] ?? executionStartMs
3419
+ ).toISOString();
3421
3420
  const assistantContent = [];
3422
3421
  if (step.reasoningText) {
3423
3422
  assistantContent.push({ type: "thinking", thinking: step.reasoningText });
@@ -3460,10 +3459,6 @@ function buildConversation3(triggerPrompt, steps, executionStartMs) {
3460
3459
  }
3461
3460
  return messages;
3462
3461
  }
3463
- function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
3464
- const offset = totalSteps > 1 ? (stepIndex + 1) / totalSteps : 1;
3465
- return new Date(startMs + Math.round(offset * 1e3)).toISOString();
3466
- }
3467
3462
 
3468
3463
  // src/run-scenario/agents/simple-agent/execute.ts
3469
3464
  var PROVIDER_ANTHROPIC2 = "anthropic";
@@ -3548,6 +3543,7 @@ async function executeWithAiSdk(context) {
3548
3543
  }
3549
3544
  }
3550
3545
  };
3546
+ const stepTimestamps = [];
3551
3547
  const result = await (0, import_ai.generateText)({
3552
3548
  model,
3553
3549
  system: systemPrompt,
@@ -3556,7 +3552,34 @@ async function executeWithAiSdk(context) {
3556
3552
  maxOutputTokens: modelConfig.maxTokens,
3557
3553
  tools: mcpTools,
3558
3554
  stopWhen: mcpTools ? (0, import_ai.stepCountIs)(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : (0, import_ai.stepCountIs)(1),
3559
- providerOptions: providerOpts
3555
+ providerOptions: providerOpts,
3556
+ onStepFinish: (step) => {
3557
+ stepTimestamps.push(Date.now());
3558
+ if (traceContext) {
3559
+ const isToolStep = step.toolCalls.length > 0;
3560
+ const firstToolCall = step.toolCalls[0];
3561
+ emitTraceEvent(
3562
+ {
3563
+ evalRunId: traceContext.evalRunId,
3564
+ scenarioId: traceContext.scenarioId,
3565
+ scenarioName: traceContext.scenarioName,
3566
+ targetId: traceContext.targetId,
3567
+ targetName: traceContext.targetName,
3568
+ stepNumber: stepTimestamps.length,
3569
+ type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
3570
+ toolName: firstToolCall?.toolName,
3571
+ toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
3572
+ outputPreview: step.text?.slice(0, 500),
3573
+ elapsedMs: Date.now() - startTime,
3574
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3575
+ isComplete: false
3576
+ },
3577
+ traceContext.tracePushUrl,
3578
+ traceContext.routeHeader,
3579
+ traceContext.authToken
3580
+ );
3581
+ }
3582
+ }
3560
3583
  });
3561
3584
  const durationMs = Date.now() - startTime;
3562
3585
  const usage = {
@@ -3570,16 +3593,17 @@ async function executeWithAiSdk(context) {
3570
3593
  usage,
3571
3594
  modelConfig.model,
3572
3595
  provider,
3573
- startTime
3596
+ startTime,
3597
+ stepTimestamps
3574
3598
  );
3575
3599
  if (traceContext) {
3576
- emitStepEvents(traceContext, result.steps, startTime);
3577
- emitCompletionEvent(traceContext, result.steps.length + 1);
3600
+ emitCompletionEvent(traceContext, stepTimestamps.length + 1);
3578
3601
  }
3579
3602
  const conversation = buildConversation3(
3580
3603
  scenario.triggerPrompt,
3581
3604
  result.steps,
3582
- startTime
3605
+ startTime,
3606
+ stepTimestamps
3583
3607
  );
3584
3608
  return {
3585
3609
  outputText: result.text,
@@ -3620,20 +3644,16 @@ function findToolResultError(step) {
3620
3644
  }
3621
3645
  return null;
3622
3646
  }
3623
- function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs) {
3624
- const totalStepTokens = steps.reduce(
3625
- (sum, s) => sum + (s.usage.totalTokens ?? 0),
3626
- 0
3627
- );
3647
+ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, executionStartMs, stepTimestamps) {
3628
3648
  const traceSteps = steps.map((step, i) => {
3629
- const stepTokens = step.usage.totalTokens ?? 0;
3630
- const proportion = totalStepTokens > 0 ? stepTokens / totalStepTokens : 0;
3631
- const stepDurationMs = Math.round(totalDurationMs * proportion);
3649
+ const stepFinishedAt = stepTimestamps[i] ?? executionStartMs;
3650
+ const stepStartedAt = i === 0 ? executionStartMs : stepTimestamps[i - 1] ?? executionStartMs;
3651
+ const stepDurationMs = stepFinishedAt - stepStartedAt;
3632
3652
  const firstToolCall = step.toolCalls[0];
3633
3653
  const tokenUsage = {
3634
3654
  prompt: step.usage.inputTokens ?? 0,
3635
3655
  completion: step.usage.outputTokens ?? 0,
3636
- total: stepTokens
3656
+ total: step.usage.totalTokens ?? 0
3637
3657
  };
3638
3658
  const costUsd = calculateStepCost(step, modelId, provider, tokenUsage);
3639
3659
  const toolResultError = findToolResultError(step);
@@ -3644,9 +3664,7 @@ function buildLLMTrace2(steps, totalDurationMs, totalUsage, modelId, provider, e
3644
3664
  type: step.toolCalls.length > 0 ? import_evalforge_types11.LLMStepType.TOOL_USE : import_evalforge_types11.LLMStepType.COMPLETION,
3645
3665
  model: modelId,
3646
3666
  provider,
3647
- startedAt: new Date(
3648
- executionStartMs + Math.round(totalDurationMs * (i / Math.max(steps.length, 1)))
3649
- ).toISOString(),
3667
+ startedAt: new Date(stepStartedAt).toISOString(),
3650
3668
  durationMs: stepDurationMs,
3651
3669
  tokenUsage,
3652
3670
  costUsd,
@@ -3704,33 +3722,6 @@ function emitStartEvent(traceContext, startTime) {
3704
3722
  traceContext.authToken
3705
3723
  );
3706
3724
  }
3707
- function emitStepEvents(traceContext, steps, startTime) {
3708
- for (let i = 0; i < steps.length; i++) {
3709
- const step = steps[i];
3710
- const isToolStep = step.toolCalls.length > 0;
3711
- const firstToolCall = step.toolCalls[0];
3712
- emitTraceEvent(
3713
- {
3714
- evalRunId: traceContext.evalRunId,
3715
- scenarioId: traceContext.scenarioId,
3716
- scenarioName: traceContext.scenarioName,
3717
- targetId: traceContext.targetId,
3718
- targetName: traceContext.targetName,
3719
- stepNumber: i + 1,
3720
- type: isToolStep ? import_evalforge_types11.LiveTraceEventType.TOOL_USE : import_evalforge_types11.LiveTraceEventType.COMPLETION,
3721
- toolName: firstToolCall?.toolName,
3722
- toolArgs: firstToolCall ? (JSON.stringify(firstToolCall.input) ?? "").slice(0, 500) : void 0,
3723
- outputPreview: step.text?.slice(0, 500),
3724
- elapsedMs: Date.now() - startTime,
3725
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3726
- isComplete: false
3727
- },
3728
- traceContext.tracePushUrl,
3729
- traceContext.routeHeader,
3730
- traceContext.authToken
3731
- );
3732
- }
3733
- }
3734
3725
  function emitCompletionEvent(traceContext, stepNumber) {
3735
3726
  emitTraceEvent(
3736
3727
  {