@wix/evalforge-evaluator 0.107.0 → 0.109.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1769,6 +1769,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
1769
1769
  let toolArgs;
1770
1770
  let outputPreview;
1771
1771
  let filePath;
1772
+ let thinking;
1772
1773
  for (const block of message.message.content) {
1773
1774
  if (block.type === "tool_use") {
1774
1775
  type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
@@ -1785,6 +1786,15 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
1785
1786
  }
1786
1787
  } else if (block.type === "text") {
1787
1788
  outputPreview = block.text.slice(0, 500);
1789
+ if (!toolName) {
1790
+ type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
1791
+ }
1792
+ } else if (block.type === "thinking") {
1793
+ const thinkingBlock = block;
1794
+ thinking = thinkingBlock.thinking.slice(0, 500);
1795
+ if (!outputPreview && !toolName) {
1796
+ type = import_evalforge_types4.LiveTraceEventType.THINKING;
1797
+ }
1788
1798
  }
1789
1799
  }
1790
1800
  return {
@@ -1799,6 +1809,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
1799
1809
  toolArgs,
1800
1810
  outputPreview,
1801
1811
  filePath,
1812
+ thinking,
1802
1813
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1803
1814
  isComplete
1804
1815
  };
@@ -2424,28 +2435,53 @@ function processMessages(timestampedMessages, startTime, endTime) {
2424
2435
  const sortedGroups = Array.from(assistantMessageGroups.values()).sort(
2425
2436
  (a, b) => a.firstReceivedAt.getTime() - b.firstReceivedAt.getTime()
2426
2437
  );
2427
- for (let i = 0; i < sortedGroups.length; i++) {
2428
- const group = sortedGroups[i];
2429
- const lastMessage = group.messages[group.messages.length - 1];
2430
- const stepStartTime = group.firstReceivedAt;
2431
- const nextStepStartTime = i < sortedGroups.length - 1 ? sortedGroups[i + 1].firstReceivedAt : endTime;
2438
+ const mergedTurns = [];
2439
+ for (const group of sortedGroups) {
2440
+ const lastMsg = group.messages[group.messages.length - 1];
2441
+ const inputTokens = lastMsg.message.usage.input_tokens;
2442
+ const prev = mergedTurns[mergedTurns.length - 1];
2443
+ const prevLastMsg = prev?.groups[prev.groups.length - 1].messages.at(-1);
2444
+ const prevInputTokens = prevLastMsg?.message.usage.input_tokens;
2445
+ if (prev && prevInputTokens === inputTokens) {
2446
+ prev.groups.push(group);
2447
+ prev.lastReceivedAt = group.lastReceivedAt;
2448
+ } else {
2449
+ mergedTurns.push({
2450
+ groups: [group],
2451
+ firstReceivedAt: group.firstReceivedAt,
2452
+ lastReceivedAt: group.lastReceivedAt
2453
+ });
2454
+ }
2455
+ }
2456
+ for (let i = 0; i < mergedTurns.length; i++) {
2457
+ const turn = mergedTurns[i];
2458
+ const stepStartTime = turn.firstReceivedAt;
2459
+ const nextStepStartTime = i < mergedTurns.length - 1 ? mergedTurns[i + 1].firstReceivedAt : endTime;
2432
2460
  const durationMs = nextStepStartTime.getTime() - stepStartTime.getTime();
2433
- const usage = lastMessage.message.usage;
2434
- const inputTokens = usage.input_tokens;
2435
- const outputTokens = usage.output_tokens;
2436
2461
  let text2 = "";
2437
2462
  let thinking = "";
2438
2463
  const toolCalls = [];
2439
- for (const block of lastMessage.message.content) {
2440
- if (block.type === "text") {
2441
- text2 += block.text;
2442
- } else if (block.type === "thinking") {
2443
- thinking += block.thinking;
2444
- } else if (block.type === "tool_use") {
2445
- toolCalls.push({
2446
- toolName: block.name,
2447
- args: block.input
2448
- });
2464
+ let lastStopReason = null;
2465
+ let inputTokens = 0;
2466
+ let outputTokens = 0;
2467
+ let cacheReadTokens;
2468
+ let cacheWriteTokens;
2469
+ for (const group of turn.groups) {
2470
+ const lastMessage = group.messages[group.messages.length - 1];
2471
+ lastStopReason = lastMessage.message.stop_reason;
2472
+ const msgUsage = lastMessage.message.usage;
2473
+ inputTokens = msgUsage.input_tokens;
2474
+ outputTokens = msgUsage.output_tokens;
2475
+ cacheReadTokens = msgUsage.cache_read_input_tokens ?? void 0;
2476
+ cacheWriteTokens = msgUsage.cache_creation_input_tokens ?? void 0;
2477
+ for (const block of lastMessage.message.content) {
2478
+ if (block.type === "text") {
2479
+ text2 += block.text;
2480
+ } else if (block.type === "thinking") {
2481
+ thinking += block.thinking;
2482
+ } else if (block.type === "tool_use") {
2483
+ toolCalls.push({ toolName: block.name, args: block.input });
2484
+ }
2449
2485
  }
2450
2486
  }
2451
2487
  steps.push({
@@ -2454,13 +2490,14 @@ function processMessages(timestampedMessages, startTime, endTime) {
2454
2490
  usage: {
2455
2491
  inputTokens,
2456
2492
  outputTokens,
2457
- totalTokens: inputTokens + outputTokens
2493
+ totalTokens: inputTokens + outputTokens,
2494
+ cacheReadTokens: cacheReadTokens || void 0,
2495
+ cacheWriteTokens: cacheWriteTokens || void 0
2458
2496
  },
2459
- finishReason: mapStopReason(lastMessage.message.stop_reason),
2497
+ finishReason: mapStopReason(lastStopReason),
2460
2498
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
2461
2499
  startedAt: stepStartTime,
2462
2500
  durationMs: Math.max(0, durationMs)
2463
- // Ensure non-negative
2464
2501
  });
2465
2502
  }
2466
2503
  return { steps, result };
@@ -2500,35 +2537,35 @@ function extractTotalUsage(result) {
2500
2537
  inputTokens: usage.input_tokens,
2501
2538
  outputTokens: usage.output_tokens,
2502
2539
  totalTokens: usage.input_tokens + usage.output_tokens,
2503
- costUsd: result.total_cost_usd
2540
+ costUsd: result.total_cost_usd,
2541
+ cacheReadTokens: usage.cache_read_input_tokens || void 0,
2542
+ cacheWriteTokens: usage.cache_creation_input_tokens || void 0,
2543
+ durationApiMs: result.duration_api_ms || void 0
2504
2544
  };
2505
2545
  }
2506
2546
  function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2507
2547
  const totalCost = usage.costUsd ?? 0;
2508
- const totalStepTokens = steps.reduce(
2509
- (sum, s) => sum + s.usage.totalTokens,
2548
+ const totalStepInputTokens = steps.reduce(
2549
+ (sum, s) => sum + s.usage.inputTokens,
2510
2550
  0
2511
2551
  );
2512
2552
  const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
2513
- const hasPerStepTokens = totalStepTokens > 0;
2553
+ const inputTokensDuplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
2514
2554
  const traceSteps = steps.map((step, index) => {
2515
- let tokenUsage;
2555
+ let stepPromptTokens;
2556
+ let stepOutputTokens;
2516
2557
  let proportion;
2517
- if (hasPerStepTokens) {
2518
- proportion = step.usage.totalTokens / totalStepTokens;
2519
- tokenUsage = {
2520
- prompt: step.usage.inputTokens,
2521
- completion: step.usage.outputTokens,
2522
- total: step.usage.totalTokens
2523
- };
2524
- } else {
2558
+ if (inputTokensDuplicated) {
2525
2559
  proportion = totalStepDuration > 0 ? step.durationMs / totalStepDuration : 0;
2526
- tokenUsage = {
2527
- prompt: Math.round(usage.inputTokens * proportion),
2528
- completion: Math.round(usage.outputTokens * proportion),
2529
- total: Math.round(usage.totalTokens * proportion)
2530
- };
2560
+ stepPromptTokens = Math.round(usage.inputTokens * proportion);
2561
+ stepOutputTokens = Math.round(usage.outputTokens * proportion);
2562
+ } else {
2563
+ proportion = totalStepInputTokens > 0 ? step.usage.inputTokens / totalStepInputTokens : 0;
2564
+ stepPromptTokens = step.usage.inputTokens;
2565
+ stepOutputTokens = Math.round(usage.outputTokens * proportion);
2531
2566
  }
2567
+ const stepTotalTokens = stepPromptTokens + stepOutputTokens;
2568
+ const costProportion = proportion;
2532
2569
  const stepType = step.toolCalls?.length ? import_evalforge_types4.LLMStepType.TOOL_USE : step.thinking && !step.text ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
2533
2570
  return {
2534
2571
  id: (0, import_crypto.randomUUID)(),
@@ -2538,11 +2575,15 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2538
2575
  provider: "anthropic",
2539
2576
  startedAt: step.startedAt.toISOString(),
2540
2577
  durationMs: step.durationMs,
2541
- tokenUsage,
2542
- costUsd: totalCost * proportion,
2578
+ tokenUsage: {
2579
+ prompt: stepPromptTokens,
2580
+ completion: stepOutputTokens,
2581
+ total: stepTotalTokens
2582
+ },
2583
+ costUsd: totalCost * costProportion,
2543
2584
  toolName: step.toolCalls?.[0]?.toolName,
2544
2585
  toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
2545
- outputPreview: step.text?.slice(0, 200),
2586
+ outputPreview: (step.text || step.thinking)?.slice(0, 200),
2546
2587
  success: step.finishReason !== "error",
2547
2588
  error: step.finishReason === "error" ? "Generation failed" : void 0
2548
2589
  };
@@ -2552,21 +2593,35 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2552
2593
  completion: usage.outputTokens,
2553
2594
  total: usage.totalTokens
2554
2595
  };
2555
- const finalCost = totalCost;
2596
+ const stepTypeBreakdown = {};
2597
+ for (const ts of traceSteps) {
2598
+ const entry = stepTypeBreakdown[ts.type] ?? {
2599
+ count: 0,
2600
+ durationMs: 0,
2601
+ tokens: 0,
2602
+ costUsd: 0
2603
+ };
2604
+ entry.count += 1;
2605
+ entry.durationMs += ts.durationMs;
2606
+ entry.tokens += ts.tokenUsage.total;
2607
+ entry.costUsd += ts.costUsd;
2608
+ stepTypeBreakdown[ts.type] = entry;
2609
+ }
2556
2610
  const summary = {
2557
2611
  totalSteps: traceSteps.length,
2558
2612
  totalDurationMs,
2559
2613
  totalTokens: finalTokens,
2560
- totalCostUsd: finalCost,
2614
+ totalCostUsd: totalCost,
2561
2615
  modelBreakdown: {
2562
2616
  [model]: {
2563
2617
  count: traceSteps.length,
2564
2618
  durationMs: totalDurationMs,
2565
2619
  tokens: finalTokens.total,
2566
- costUsd: finalCost
2620
+ costUsd: totalCost
2567
2621
  }
2568
2622
  },
2569
- modelsUsed: [model]
2623
+ modelsUsed: [model],
2624
+ stepTypeBreakdown
2570
2625
  };
2571
2626
  return {
2572
2627
  id: (0, import_crypto.randomUUID)(),
@@ -2606,6 +2661,7 @@ var ClaudeCodeAdapter = class {
2606
2661
  model: modelForSdk,
2607
2662
  temperature: modelConfig?.temperature,
2608
2663
  maxTokens: modelConfig?.maxTokens,
2664
+ maxTurns: modelConfig?.maxTurns,
2609
2665
  aiGatewayUrl,
2610
2666
  aiGatewayHeaders,
2611
2667
  traceContext,
@@ -42638,7 +42694,6 @@ var MODEL_PRICING = {
42638
42694
  CLAUDE_4_OPUS_1_0: { input: 15, output: 75 },
42639
42695
  CLAUDE_4_SONNET_1_0: { input: 3, output: 15 },
42640
42696
  // Anthropic — Claude 3.x
42641
- CLAUDE_3_7_SONNET_1_0: { input: 3, output: 15 },
42642
42697
  CLAUDE_3_5_SONNET_2_0: { input: 3, output: 15 },
42643
42698
  CLAUDE_3_5_SONNET_1_0: { input: 3, output: 15 },
42644
42699
  CLAUDE_3_HAIKU_1_0: { input: 0.25, output: 1.25 },
@@ -42756,7 +42811,7 @@ function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
42756
42811
  // src/run-scenario/agents/simple-agent/execute.ts
42757
42812
  var PROVIDER_ANTHROPIC2 = "anthropic";
42758
42813
  var PROVIDER_OPENAI = "openai";
42759
- var MAX_TOOL_STEPS = 25;
42814
+ var DEFAULT_MAX_TOOL_STEPS = 25;
42760
42815
  function createModel(modelId, baseUrl, headers) {
42761
42816
  const isClaudeModel = import_evalforge_types6.AVAILABLE_CLAUDE_MODEL_IDS.includes(
42762
42817
  modelId
@@ -42774,6 +42829,9 @@ function createModel(modelId, baseUrl, headers) {
42774
42829
  apiKey: "proxy-auth",
42775
42830
  headers
42776
42831
  });
42832
+ if (import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelId)) {
42833
+ return openai2.responses(modelId);
42834
+ }
42777
42835
  return openai2.chat(modelId);
42778
42836
  }
42779
42837
  function isClaudeModelId(modelId) {
@@ -42810,14 +42868,34 @@ async function executeWithAiSdk(context2) {
42810
42868
  emitStartEvent(traceContext, startTime);
42811
42869
  }
42812
42870
  try {
42871
+ const isAnthropic = provider === PROVIDER_ANTHROPIC2;
42872
+ const isResponsesAPI = import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelConfig.model);
42873
+ const supportsThinking = isAnthropic || isResponsesAPI;
42874
+ const providerOpts = {
42875
+ ...isAnthropic && {
42876
+ anthropic: {
42877
+ thinking: { type: "enabled", budgetTokens: 1e4 }
42878
+ }
42879
+ },
42880
+ ...isResponsesAPI && {
42881
+ openai: {
42882
+ // Prevent the SDK from sending item_reference inputs that the proxy can't forward
42883
+ store: false,
42884
+ forceReasoning: true,
42885
+ reasoningEffort: "high",
42886
+ reasoningSummary: "detailed"
42887
+ }
42888
+ }
42889
+ };
42813
42890
  const result = await generateText({
42814
42891
  model,
42815
42892
  system: systemPrompt,
42816
42893
  prompt: scenario.triggerPrompt,
42817
- temperature: modelConfig.temperature,
42894
+ temperature: supportsThinking ? void 0 : modelConfig.temperature,
42818
42895
  maxOutputTokens: modelConfig.maxTokens,
42819
42896
  tools: mcpTools,
42820
- stopWhen: mcpTools ? stepCountIs(MAX_TOOL_STEPS) : stepCountIs(1)
42897
+ stopWhen: mcpTools ? stepCountIs(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : stepCountIs(1),
42898
+ providerOptions: providerOpts
42821
42899
  });
42822
42900
  const durationMs = Date.now() - startTime;
42823
42901
  const usage = {