@wix/evalforge-evaluator 0.107.0 → 0.109.0

This diff shows the publicly available contents of the two package versions as published to their respective registries. It is provided for informational purposes only.
package/build/index.mjs CHANGED
@@ -1789,6 +1789,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
  let toolArgs;
  let outputPreview;
  let filePath;
+ let thinking;
  for (const block of message.message.content) {
    if (block.type === "tool_use") {
      type = LiveTraceEventType.TOOL_USE;
@@ -1805,6 +1806,15 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
      }
    } else if (block.type === "text") {
      outputPreview = block.text.slice(0, 500);
+     if (!toolName) {
+       type = LiveTraceEventType.COMPLETION;
+     }
+   } else if (block.type === "thinking") {
+     const thinkingBlock = block;
+     thinking = thinkingBlock.thinking.slice(0, 500);
+     if (!outputPreview && !toolName) {
+       type = LiveTraceEventType.THINKING;
+     }
    }
  }
  return {
@@ -1819,6 +1829,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
    toolArgs,
    outputPreview,
    filePath,
+   thinking,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };
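
Taken together, the three hunks above teach createTraceEventFromMessage about "thinking" content blocks: a 500-character thinking preview is captured next to the existing fields, and a message carrying neither text nor a tool call is classified as a THINKING event. A minimal runnable sketch of the added classification rules; the enum values and block shapes are simplified stand-ins, not the package's real types:

    // Sketch of the block-classification rules the hunks above add.
    // LiveTraceEventType and the block shape are simplified stand-ins.
    const LiveTraceEventType = { TOOL_USE: "tool_use", COMPLETION: "completion", THINKING: "thinking" };

    function classify(contentBlocks) {
      let type, toolName, outputPreview, thinking;
      for (const block of contentBlocks) {
        if (block.type === "tool_use") {
          type = LiveTraceEventType.TOOL_USE;
          toolName = block.name;
        } else if (block.type === "text") {
          outputPreview = block.text.slice(0, 500);
          if (!toolName) type = LiveTraceEventType.COMPLETION; // text wins unless a tool ran
        } else if (block.type === "thinking") {
          thinking = block.thinking.slice(0, 500);
          // Thinking only sets the event type when nothing else has claimed it.
          if (!outputPreview && !toolName) type = LiveTraceEventType.THINKING;
        }
      }
      return { type, toolName, outputPreview, thinking };
    }

    // A message that only thinks is reported as a THINKING event:
    console.log(classify([{ type: "thinking", thinking: "Let me check the schema..." }]).type); // "thinking"
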
@@ -2444,28 +2455,53 @@ function processMessages(timestampedMessages, startTime, endTime) {
  const sortedGroups = Array.from(assistantMessageGroups.values()).sort(
    (a, b) => a.firstReceivedAt.getTime() - b.firstReceivedAt.getTime()
  );
- for (let i = 0; i < sortedGroups.length; i++) {
-   const group = sortedGroups[i];
-   const lastMessage = group.messages[group.messages.length - 1];
-   const stepStartTime = group.firstReceivedAt;
-   const nextStepStartTime = i < sortedGroups.length - 1 ? sortedGroups[i + 1].firstReceivedAt : endTime;
+ const mergedTurns = [];
+ for (const group of sortedGroups) {
+   const lastMsg = group.messages[group.messages.length - 1];
+   const inputTokens = lastMsg.message.usage.input_tokens;
+   const prev = mergedTurns[mergedTurns.length - 1];
+   const prevLastMsg = prev?.groups[prev.groups.length - 1].messages.at(-1);
+   const prevInputTokens = prevLastMsg?.message.usage.input_tokens;
+   if (prev && prevInputTokens === inputTokens) {
+     prev.groups.push(group);
+     prev.lastReceivedAt = group.lastReceivedAt;
+   } else {
+     mergedTurns.push({
+       groups: [group],
+       firstReceivedAt: group.firstReceivedAt,
+       lastReceivedAt: group.lastReceivedAt
+     });
+   }
+ }
+ for (let i = 0; i < mergedTurns.length; i++) {
+   const turn = mergedTurns[i];
+   const stepStartTime = turn.firstReceivedAt;
+   const nextStepStartTime = i < mergedTurns.length - 1 ? mergedTurns[i + 1].firstReceivedAt : endTime;
    const durationMs = nextStepStartTime.getTime() - stepStartTime.getTime();
-   const usage = lastMessage.message.usage;
-   const inputTokens = usage.input_tokens;
-   const outputTokens = usage.output_tokens;
    let text2 = "";
    let thinking = "";
    const toolCalls = [];
-   for (const block of lastMessage.message.content) {
-     if (block.type === "text") {
-       text2 += block.text;
-     } else if (block.type === "thinking") {
-       thinking += block.thinking;
-     } else if (block.type === "tool_use") {
-       toolCalls.push({
-         toolName: block.name,
-         args: block.input
-       });
+   let lastStopReason = null;
+   let inputTokens = 0;
+   let outputTokens = 0;
+   let cacheReadTokens;
+   let cacheWriteTokens;
+   for (const group of turn.groups) {
+     const lastMessage = group.messages[group.messages.length - 1];
+     lastStopReason = lastMessage.message.stop_reason;
+     const msgUsage = lastMessage.message.usage;
+     inputTokens = msgUsage.input_tokens;
+     outputTokens = msgUsage.output_tokens;
+     cacheReadTokens = msgUsage.cache_read_input_tokens ?? void 0;
+     cacheWriteTokens = msgUsage.cache_creation_input_tokens ?? void 0;
+     for (const block of lastMessage.message.content) {
+       if (block.type === "text") {
+         text2 += block.text;
+       } else if (block.type === "thinking") {
+         thinking += block.thinking;
+       } else if (block.type === "tool_use") {
+         toolCalls.push({ toolName: block.name, args: block.input });
+       }
      }
    }
    steps.push({
@@ -2474,13 +2510,14 @@ function processMessages(timestampedMessages, startTime, endTime) {
      usage: {
        inputTokens,
        outputTokens,
-       totalTokens: inputTokens + outputTokens
+       totalTokens: inputTokens + outputTokens,
+       cacheReadTokens: cacheReadTokens || void 0,
+       cacheWriteTokens: cacheWriteTokens || void 0
      },
-     finishReason: mapStopReason(lastMessage.message.stop_reason),
+     finishReason: mapStopReason(lastStopReason),
      toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
      startedAt: stepStartTime,
      durationMs: Math.max(0, durationMs)
-     // Ensure non-negative
    });
  }
  return { steps, result };
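
Where processMessages previously emitted one step per assistant-message group, it now first merges consecutive groups whose final message reports the same usage.input_tokens, treating identical prompt-token counts as evidence that the groups are fragments of a single API turn, and then accumulates stop reason, usage (including cache read/write tokens), and content blocks across each merged turn. A runnable sketch of just the merge pass, with a simplified stand-in for the group shape:

    // Sketch of the merge pass: consecutive groups whose last message reports
    // the same input_tokens are treated as fragments of one API turn.
    function mergeTurns(sortedGroups) {
      const mergedTurns = [];
      for (const group of sortedGroups) {
        const lastMsg = group.messages[group.messages.length - 1];
        const inputTokens = lastMsg.message.usage.input_tokens;
        const prev = mergedTurns[mergedTurns.length - 1];
        const prevLast = prev?.groups[prev.groups.length - 1].messages.at(-1);
        if (prev && prevLast?.message.usage.input_tokens === inputTokens) {
          prev.groups.push(group); // same turn, streamed in pieces
          prev.lastReceivedAt = group.lastReceivedAt;
        } else {
          mergedTurns.push({
            groups: [group],
            firstReceivedAt: group.firstReceivedAt,
            lastReceivedAt: group.lastReceivedAt
          });
        }
      }
      return mergedTurns;
    }

    // Two groups with matching input_tokens collapse into one turn:
    const g = (tokens, t) => ({ messages: [{ message: { usage: { input_tokens: tokens } } }], firstReceivedAt: new Date(t), lastReceivedAt: new Date(t) });
    console.log(mergeTurns([g(100, 0), g(100, 1), g(250, 2)]).length); // 2
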
@@ -2520,35 +2557,35 @@ function extractTotalUsage(result) {
      inputTokens: usage.input_tokens,
      outputTokens: usage.output_tokens,
      totalTokens: usage.input_tokens + usage.output_tokens,
-     costUsd: result.total_cost_usd
+     costUsd: result.total_cost_usd,
+     cacheReadTokens: usage.cache_read_input_tokens || void 0,
+     cacheWriteTokens: usage.cache_creation_input_tokens || void 0,
+     durationApiMs: result.duration_api_ms || void 0
    };
  }
  function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
    const totalCost = usage.costUsd ?? 0;
-   const totalStepTokens = steps.reduce(
-     (sum, s) => sum + s.usage.totalTokens,
+   const totalStepInputTokens = steps.reduce(
+     (sum, s) => sum + s.usage.inputTokens,
      0
    );
    const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
-   const hasPerStepTokens = totalStepTokens > 0;
+   const inputTokensDuplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
    const traceSteps = steps.map((step, index) => {
-     let tokenUsage;
+     let stepPromptTokens;
+     let stepOutputTokens;
      let proportion;
-     if (hasPerStepTokens) {
-       proportion = step.usage.totalTokens / totalStepTokens;
-       tokenUsage = {
-         prompt: step.usage.inputTokens,
-         completion: step.usage.outputTokens,
-         total: step.usage.totalTokens
-       };
-     } else {
+     if (inputTokensDuplicated) {
        proportion = totalStepDuration > 0 ? step.durationMs / totalStepDuration : 0;
-       tokenUsage = {
-         prompt: Math.round(usage.inputTokens * proportion),
-         completion: Math.round(usage.outputTokens * proportion),
-         total: Math.round(usage.totalTokens * proportion)
-       };
+       stepPromptTokens = Math.round(usage.inputTokens * proportion);
+       stepOutputTokens = Math.round(usage.outputTokens * proportion);
+     } else {
+       proportion = totalStepInputTokens > 0 ? step.usage.inputTokens / totalStepInputTokens : 0;
+       stepPromptTokens = step.usage.inputTokens;
+       stepOutputTokens = Math.round(usage.outputTokens * proportion);
      }
+     const stepTotalTokens = stepPromptTokens + stepOutputTokens;
+     const costProportion = proportion;
      const stepType = step.toolCalls?.length ? LLMStepType.TOOL_USE : step.thinking && !step.text ? LLMStepType.THINKING : LLMStepType.COMPLETION;
      return {
        id: randomUUID(),
@@ -2558,11 +2595,15 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
        provider: "anthropic",
        startedAt: step.startedAt.toISOString(),
        durationMs: step.durationMs,
-       tokenUsage,
-       costUsd: totalCost * proportion,
+       tokenUsage: {
+         prompt: stepPromptTokens,
+         completion: stepOutputTokens,
+         total: stepTotalTokens
+       },
+       costUsd: totalCost * costProportion,
        toolName: step.toolCalls?.[0]?.toolName,
        toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
-       outputPreview: step.text?.slice(0, 200),
+       outputPreview: (step.text || step.thinking)?.slice(0, 200),
        success: step.finishReason !== "error",
        error: step.finishReason === "error" ? "Generation failed" : void 0
      };
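
The per-step token attribution also changes. The old code trusted per-step totals whenever any step reported tokens; the new code flags duplication when the summed per-step input tokens exceed the turn-level total by more than 20% (a sign the same prompt tokens were counted once per step), and only then falls back to duration-weighted splitting. Otherwise it keeps the real per-step input counts and apportions only the output tokens proportionally. A worked example of the 1.2x check, with invented numbers:

    // Worked example of the 1.2x duplication check added above (values invented).
    const usage = { inputTokens: 10000, outputTokens: 900 };
    const steps = [
      { usage: { inputTokens: 9000, outputTokens: 0 }, durationMs: 400 },
      { usage: { inputTokens: 9600, outputTokens: 0 }, durationMs: 600 },
    ];
    const totalStepInputTokens = steps.reduce((s, x) => s + x.usage.inputTokens, 0); // 18600
    // 18600 > 10000 * 1.2, so per-step input counts look cumulative/duplicated
    // and tokens are re-attributed by duration share instead (40% / 60% here).
    const duplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
    console.log(duplicated); // true
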
@@ -2572,21 +2613,35 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
      completion: usage.outputTokens,
      total: usage.totalTokens
    };
-   const finalCost = totalCost;
+   const stepTypeBreakdown = {};
+   for (const ts of traceSteps) {
+     const entry = stepTypeBreakdown[ts.type] ?? {
+       count: 0,
+       durationMs: 0,
+       tokens: 0,
+       costUsd: 0
+     };
+     entry.count += 1;
+     entry.durationMs += ts.durationMs;
+     entry.tokens += ts.tokenUsage.total;
+     entry.costUsd += ts.costUsd;
+     stepTypeBreakdown[ts.type] = entry;
+   }
    const summary = {
      totalSteps: traceSteps.length,
      totalDurationMs,
      totalTokens: finalTokens,
-     totalCostUsd: finalCost,
+     totalCostUsd: totalCost,
      modelBreakdown: {
        [model]: {
          count: traceSteps.length,
          durationMs: totalDurationMs,
          tokens: finalTokens.total,
-         costUsd: finalCost
+         costUsd: totalCost
        }
      },
-     modelsUsed: [model]
+     modelsUsed: [model],
+     stepTypeBreakdown
    };
    return {
      id: randomUUID(),
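
The trace summary gains a stepTypeBreakdown: one accumulator per step type (tool use, thinking, completion), each tracking count, duration, tokens, and cost. A self-contained sketch of the rollup over stand-in trace steps:

    // Sketch of the stepTypeBreakdown rollup: one accumulator per step type.
    function breakdownByType(traceSteps) {
      const out = {};
      for (const ts of traceSteps) {
        const entry = out[ts.type] ?? { count: 0, durationMs: 0, tokens: 0, costUsd: 0 };
        entry.count += 1;
        entry.durationMs += ts.durationMs;
        entry.tokens += ts.tokenUsage.total;
        entry.costUsd += ts.costUsd;
        out[ts.type] = entry;
      }
      return out;
    }

    console.log(breakdownByType([
      { type: "tool_use", durationMs: 300, tokenUsage: { total: 1200 }, costUsd: 0.004 },
      { type: "tool_use", durationMs: 200, tokenUsage: { total: 800 }, costUsd: 0.003 },
      { type: "completion", durationMs: 900, tokenUsage: { total: 2500 }, costUsd: 0.01 },
    ]));
    // { tool_use: { count: 2, durationMs: 500, tokens: 2000, costUsd: 0.007 }, completion: { ... } }
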
@@ -2626,6 +2681,7 @@ var ClaudeCodeAdapter = class {
  model: modelForSdk,
  temperature: modelConfig?.temperature,
  maxTokens: modelConfig?.maxTokens,
+ maxTurns: modelConfig?.maxTurns,
  aiGatewayUrl,
  aiGatewayHeaders,
  traceContext,
@@ -40080,6 +40136,7 @@ var openai = createOpenAI();
  // src/run-scenario/agents/simple-agent/execute.ts
  import {
    AVAILABLE_CLAUDE_MODEL_IDS,
+   OPENAI_RESPONSES_MODEL_IDS,
    LLMStepType as LLMStepType2,
    LiveTraceEventType as LiveTraceEventType2
  } from "@wix/evalforge-types";
@@ -42662,7 +42719,6 @@ var MODEL_PRICING = {
  CLAUDE_4_OPUS_1_0: { input: 15, output: 75 },
  CLAUDE_4_SONNET_1_0: { input: 3, output: 15 },
  // Anthropic — Claude 3.x
- CLAUDE_3_7_SONNET_1_0: { input: 3, output: 15 },
  CLAUDE_3_5_SONNET_2_0: { input: 3, output: 15 },
  CLAUDE_3_5_SONNET_1_0: { input: 3, output: 15 },
  CLAUDE_3_HAIKU_1_0: { input: 0.25, output: 1.25 },
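
This hunk only removes the CLAUDE_3_7_SONNET_1_0 pricing row. The remaining figures line up with Anthropic's published per-million-token prices, so a consumer of this table would presumably compute cost as in the sketch below; the units are an inference and estimateCostUsd is a hypothetical helper, not the package's actual lookup:

    // Hypothetical cost estimate, assuming MODEL_PRICING is USD per million tokens.
    const MODEL_PRICING = { CLAUDE_4_SONNET_1_0: { input: 3, output: 15 } };
    function estimateCostUsd(modelKey, inputTokens, outputTokens) {
      const p = MODEL_PRICING[modelKey];
      return (inputTokens / 1e6) * p.input + (outputTokens / 1e6) * p.output;
    }
    console.log(estimateCostUsd("CLAUDE_4_SONNET_1_0", 1_000_000, 100_000)); // 3 + 1.5 = 4.5
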
@@ -42780,7 +42836,7 @@ function estimateStepTimestamp(startMs, stepIndex, totalSteps) {
  // src/run-scenario/agents/simple-agent/execute.ts
  var PROVIDER_ANTHROPIC2 = "anthropic";
  var PROVIDER_OPENAI = "openai";
- var MAX_TOOL_STEPS = 25;
+ var DEFAULT_MAX_TOOL_STEPS = 25;
  function createModel(modelId, baseUrl, headers) {
    const isClaudeModel = AVAILABLE_CLAUDE_MODEL_IDS.includes(
      modelId
@@ -42798,6 +42854,9 @@ function createModel(modelId, baseUrl, headers) {
      apiKey: "proxy-auth",
      headers
    });
+   if (OPENAI_RESPONSES_MODEL_IDS.has(modelId)) {
+     return openai2.responses(modelId);
+   }
    return openai2.chat(modelId);
  }
  function isClaudeModelId(modelId) {
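
createModel now routes Responses-API model ids through openai2.responses() rather than openai2.chat(). A trivial sketch of the dispatch; the set members here are hypothetical, since OPENAI_RESPONSES_MODEL_IDS actually lives in @wix/evalforge-types:

    // Sketch of the routing added above, with hypothetical set members.
    const OPENAI_RESPONSES_MODEL_IDS = new Set(["o3", "gpt-5"]); // hypothetical ids
    function pickApi(modelId) {
      return OPENAI_RESPONSES_MODEL_IDS.has(modelId) ? "responses" : "chat";
    }
    console.log(pickApi("o3"));          // "responses"
    console.log(pickApi("gpt-4o-mini")); // "chat"
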
@@ -42834,14 +42893,34 @@ async function executeWithAiSdk(context2) {
    emitStartEvent(traceContext, startTime);
  }
  try {
+   const isAnthropic = provider === PROVIDER_ANTHROPIC2;
+   const isResponsesAPI = OPENAI_RESPONSES_MODEL_IDS.has(modelConfig.model);
+   const supportsThinking = isAnthropic || isResponsesAPI;
+   const providerOpts = {
+     ...isAnthropic && {
+       anthropic: {
+         thinking: { type: "enabled", budgetTokens: 1e4 }
+       }
+     },
+     ...isResponsesAPI && {
+       openai: {
+         // Prevent the SDK from sending item_reference inputs that the proxy can't forward
+         store: false,
+         forceReasoning: true,
+         reasoningEffort: "high",
+         reasoningSummary: "detailed"
+       }
+     }
+   };
    const result = await generateText({
      model,
      system: systemPrompt,
      prompt: scenario.triggerPrompt,
-     temperature: modelConfig.temperature,
+     temperature: supportsThinking ? void 0 : modelConfig.temperature,
      maxOutputTokens: modelConfig.maxTokens,
      tools: mcpTools,
-     stopWhen: mcpTools ? stepCountIs(MAX_TOOL_STEPS) : stepCountIs(1)
+     stopWhen: mcpTools ? stepCountIs(modelConfig.maxTurns ?? DEFAULT_MAX_TOOL_STEPS) : stepCountIs(1),
+     providerOptions: providerOpts
    });
    const durationMs = Date.now() - startTime;
    const usage = {
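
The simple agent now assembles providerOptions conditionally: Anthropic models get extended thinking with a 10,000-token budget, and OpenAI Responses-API models get reasoning options plus store: false (the inline comment explains the proxy constraint). Temperature is suppressed whenever thinking is enabled, presumably because providers reject a temperature override alongside extended reasoning. A sketch of the option construction, which relies on the fact that spreading a falsy value into an object literal adds nothing:

    // Sketch of the conditional providerOptions construction, outside the SDK call.
    // Spreading `condition && { ... }` adds the keys only when the condition holds.
    function buildProviderOptions({ isAnthropic, isResponsesAPI }) {
      return {
        ...isAnthropic && {
          anthropic: { thinking: { type: "enabled", budgetTokens: 1e4 } },
        },
        ...isResponsesAPI && {
          openai: { store: false, forceReasoning: true, reasoningEffort: "high", reasoningSummary: "detailed" },
        },
      };
    }

    console.log(buildProviderOptions({ isAnthropic: true, isResponsesAPI: false }));
    // { anthropic: { thinking: { type: "enabled", budgetTokens: 10000 } } }
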