@wix/evalforge-evaluator 0.106.0 → 0.108.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -1505,6 +1505,7 @@ async function writeMcpToFilesystem(cwd, mcps) {
1505
1505
  // src/run-scenario/agents/claude-code/write-sub-agents.ts
1506
1506
  var import_promises5 = require("fs/promises");
1507
1507
  var import_path6 = require("path");
1508
+ var import_evalforge_github_client3 = require("@wix/evalforge-github-client");
1508
1509
  var AGENTS_DIR = ".claude/agents";
1509
1510
  function toAgentFilename(name26, index, nameCount) {
1510
1511
  const base = (name26 || "").toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "").replace(/^-+|-+$/g, "") || `sub-agent-${index}`;
@@ -1512,7 +1513,34 @@ function toAgentFilename(name26, index, nameCount) {
1512
1513
  nameCount.set(base, count + 1);
1513
1514
  return count === 0 ? base : `${base}-${count + 1}`;
1514
1515
  }
1515
- async function writeSubAgentsToFilesystem(cwd, subAgents) {
1516
/**
 * Resolve the markdown content that should be written for a single sub-agent.
 *
 * When `agent.source` is set, the content is obtained by calling `fetchFn`
 * with that source; otherwise the inline `agent.subAgentMd` is used.
 *
 * @param {object} agent - Sub-agent definition. Reads `name`, `source`
 *   (with `owner`, `repo`, `path`, `ref` used for logging) and `subAgentMd`.
 * @param {Function} fetchFn - Async function called as
 *   `fetchFn(agent.source, { userAgent })`; its resolved value is returned as-is.
 * @returns {Promise<string>} The fetched content, or the inline content
 *   (an empty string when the inline content is missing).
 * @throws {Error} When `fetchFn` rejects; the original error is attached as `cause`.
 */
async function resolveSubAgentContent(agent, fetchFn) {
  if (agent.source) {
    try {
      const content = await fetchFn(agent.source, {
        userAgent: "EvalForge-Evaluator"
      });
      console.log(
        `[SubAgents] Fetched "${agent.name}" from ${agent.source.owner}/${agent.source.repo}/${agent.source.path}@${agent.source.ref}`
      );
      return content;
    } catch (error48) {
      const message = error48 instanceof Error ? error48.message : "Unknown error";
      console.error(
        `[SubAgents] "${agent.name}": GitHub fetch failed: ${message}`
      );
      // Keep the original error (and its stack) reachable for callers/loggers.
      throw new Error(
        `Failed to fetch sub-agent "${agent.name}" from GitHub: ${message}`,
        { cause: error48 }
      );
    }
  }
  if (!agent.subAgentMd) {
    console.warn(
      `[SubAgents] "${agent.name}" has empty inline content \u2013 the agent file will be blank`
    );
  }
  // The result is handed to writeFile, which rejects undefined/null data;
  // normalise missing inline content to "" so a blank file is written,
  // matching the warning above.
  return agent.subAgentMd ?? "";
}
1543
+ async function writeSubAgentsToFilesystem(cwd, subAgents, fetchFn = import_evalforge_github_client3.fetchGitHubFile) {
1516
1544
  if (subAgents.length === 0) return;
1517
1545
  const agentsDir = (0, import_path6.join)(cwd, AGENTS_DIR);
1518
1546
  await (0, import_promises5.mkdir)(agentsDir, { recursive: true });
@@ -1520,7 +1548,8 @@ async function writeSubAgentsToFilesystem(cwd, subAgents) {
1520
1548
  for (const [i, agent] of subAgents.entries()) {
1521
1549
  const filename = toAgentFilename(agent.name, i, nameCount);
1522
1550
  const filePath = (0, import_path6.join)(agentsDir, `${filename}.md`);
1523
- await (0, import_promises5.writeFile)(filePath, agent.subAgentMd, "utf8");
1551
+ const content = await resolveSubAgentContent(agent, fetchFn);
1552
+ await (0, import_promises5.writeFile)(filePath, content, "utf8");
1524
1553
  }
1525
1554
  console.log(`[SubAgents] Written to ${agentsDir}`);
1526
1555
  }
@@ -1740,6 +1769,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
1740
1769
  let toolArgs;
1741
1770
  let outputPreview;
1742
1771
  let filePath;
1772
+ let thinking;
1743
1773
  for (const block of message.message.content) {
1744
1774
  if (block.type === "tool_use") {
1745
1775
  type = import_evalforge_types4.LiveTraceEventType.TOOL_USE;
@@ -1756,6 +1786,15 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
1756
1786
  }
1757
1787
  } else if (block.type === "text") {
1758
1788
  outputPreview = block.text.slice(0, 500);
1789
+ if (!toolName) {
1790
+ type = import_evalforge_types4.LiveTraceEventType.COMPLETION;
1791
+ }
1792
+ } else if (block.type === "thinking") {
1793
+ const thinkingBlock = block;
1794
+ thinking = thinkingBlock.thinking.slice(0, 500);
1795
+ if (!outputPreview && !toolName) {
1796
+ type = import_evalforge_types4.LiveTraceEventType.THINKING;
1797
+ }
1759
1798
  }
1760
1799
  }
1761
1800
  return {
@@ -1770,6 +1809,7 @@ function createTraceEventFromMessage(message, context2, stepNumber, isComplete)
1770
1809
  toolArgs,
1771
1810
  outputPreview,
1772
1811
  filePath,
1812
+ thinking,
1773
1813
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1774
1814
  isComplete
1775
1815
  };
@@ -2395,28 +2435,53 @@ function processMessages(timestampedMessages, startTime, endTime) {
2395
2435
  const sortedGroups = Array.from(assistantMessageGroups.values()).sort(
2396
2436
  (a, b) => a.firstReceivedAt.getTime() - b.firstReceivedAt.getTime()
2397
2437
  );
2398
- for (let i = 0; i < sortedGroups.length; i++) {
2399
- const group = sortedGroups[i];
2400
- const lastMessage = group.messages[group.messages.length - 1];
2401
- const stepStartTime = group.firstReceivedAt;
2402
- const nextStepStartTime = i < sortedGroups.length - 1 ? sortedGroups[i + 1].firstReceivedAt : endTime;
2438
+ const mergedTurns = [];
2439
+ for (const group of sortedGroups) {
2440
+ const lastMsg = group.messages[group.messages.length - 1];
2441
+ const inputTokens = lastMsg.message.usage.input_tokens;
2442
+ const prev = mergedTurns[mergedTurns.length - 1];
2443
+ const prevLastMsg = prev?.groups[prev.groups.length - 1].messages.at(-1);
2444
+ const prevInputTokens = prevLastMsg?.message.usage.input_tokens;
2445
+ if (prev && prevInputTokens === inputTokens) {
2446
+ prev.groups.push(group);
2447
+ prev.lastReceivedAt = group.lastReceivedAt;
2448
+ } else {
2449
+ mergedTurns.push({
2450
+ groups: [group],
2451
+ firstReceivedAt: group.firstReceivedAt,
2452
+ lastReceivedAt: group.lastReceivedAt
2453
+ });
2454
+ }
2455
+ }
2456
+ for (let i = 0; i < mergedTurns.length; i++) {
2457
+ const turn = mergedTurns[i];
2458
+ const stepStartTime = turn.firstReceivedAt;
2459
+ const nextStepStartTime = i < mergedTurns.length - 1 ? mergedTurns[i + 1].firstReceivedAt : endTime;
2403
2460
  const durationMs = nextStepStartTime.getTime() - stepStartTime.getTime();
2404
- const usage = lastMessage.message.usage;
2405
- const inputTokens = usage.input_tokens;
2406
- const outputTokens = usage.output_tokens;
2407
2461
  let text2 = "";
2408
2462
  let thinking = "";
2409
2463
  const toolCalls = [];
2410
- for (const block of lastMessage.message.content) {
2411
- if (block.type === "text") {
2412
- text2 += block.text;
2413
- } else if (block.type === "thinking") {
2414
- thinking += block.thinking;
2415
- } else if (block.type === "tool_use") {
2416
- toolCalls.push({
2417
- toolName: block.name,
2418
- args: block.input
2419
- });
2464
+ let lastStopReason = null;
2465
+ let inputTokens = 0;
2466
+ let outputTokens = 0;
2467
+ let cacheReadTokens;
2468
+ let cacheWriteTokens;
2469
+ for (const group of turn.groups) {
2470
+ const lastMessage = group.messages[group.messages.length - 1];
2471
+ lastStopReason = lastMessage.message.stop_reason;
2472
+ const msgUsage = lastMessage.message.usage;
2473
+ inputTokens = msgUsage.input_tokens;
2474
+ outputTokens = msgUsage.output_tokens;
2475
+ cacheReadTokens = msgUsage.cache_read_input_tokens ?? void 0;
2476
+ cacheWriteTokens = msgUsage.cache_creation_input_tokens ?? void 0;
2477
+ for (const block of lastMessage.message.content) {
2478
+ if (block.type === "text") {
2479
+ text2 += block.text;
2480
+ } else if (block.type === "thinking") {
2481
+ thinking += block.thinking;
2482
+ } else if (block.type === "tool_use") {
2483
+ toolCalls.push({ toolName: block.name, args: block.input });
2484
+ }
2420
2485
  }
2421
2486
  }
2422
2487
  steps.push({
@@ -2425,13 +2490,14 @@ function processMessages(timestampedMessages, startTime, endTime) {
2425
2490
  usage: {
2426
2491
  inputTokens,
2427
2492
  outputTokens,
2428
- totalTokens: inputTokens + outputTokens
2493
+ totalTokens: inputTokens + outputTokens,
2494
+ cacheReadTokens: cacheReadTokens || void 0,
2495
+ cacheWriteTokens: cacheWriteTokens || void 0
2429
2496
  },
2430
- finishReason: mapStopReason(lastMessage.message.stop_reason),
2497
+ finishReason: mapStopReason(lastStopReason),
2431
2498
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
2432
2499
  startedAt: stepStartTime,
2433
2500
  durationMs: Math.max(0, durationMs)
2434
- // Ensure non-negative
2435
2501
  });
2436
2502
  }
2437
2503
  return { steps, result };
@@ -2471,35 +2537,35 @@ function extractTotalUsage(result) {
2471
2537
  inputTokens: usage.input_tokens,
2472
2538
  outputTokens: usage.output_tokens,
2473
2539
  totalTokens: usage.input_tokens + usage.output_tokens,
2474
- costUsd: result.total_cost_usd
2540
+ costUsd: result.total_cost_usd,
2541
+ cacheReadTokens: usage.cache_read_input_tokens || void 0,
2542
+ cacheWriteTokens: usage.cache_creation_input_tokens || void 0,
2543
+ durationApiMs: result.duration_api_ms || void 0
2475
2544
  };
2476
2545
  }
2477
2546
  function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2478
2547
  const totalCost = usage.costUsd ?? 0;
2479
- const totalStepTokens = steps.reduce(
2480
- (sum, s) => sum + s.usage.totalTokens,
2548
+ const totalStepInputTokens = steps.reduce(
2549
+ (sum, s) => sum + s.usage.inputTokens,
2481
2550
  0
2482
2551
  );
2483
2552
  const totalStepDuration = steps.reduce((sum, s) => sum + s.durationMs, 0);
2484
- const hasPerStepTokens = totalStepTokens > 0;
2553
+ const inputTokensDuplicated = usage.inputTokens > 0 && totalStepInputTokens > usage.inputTokens * 1.2;
2485
2554
  const traceSteps = steps.map((step, index) => {
2486
- let tokenUsage;
2555
+ let stepPromptTokens;
2556
+ let stepOutputTokens;
2487
2557
  let proportion;
2488
- if (hasPerStepTokens) {
2489
- proportion = step.usage.totalTokens / totalStepTokens;
2490
- tokenUsage = {
2491
- prompt: step.usage.inputTokens,
2492
- completion: step.usage.outputTokens,
2493
- total: step.usage.totalTokens
2494
- };
2495
- } else {
2558
+ if (inputTokensDuplicated) {
2496
2559
  proportion = totalStepDuration > 0 ? step.durationMs / totalStepDuration : 0;
2497
- tokenUsage = {
2498
- prompt: Math.round(usage.inputTokens * proportion),
2499
- completion: Math.round(usage.outputTokens * proportion),
2500
- total: Math.round(usage.totalTokens * proportion)
2501
- };
2560
+ stepPromptTokens = Math.round(usage.inputTokens * proportion);
2561
+ stepOutputTokens = Math.round(usage.outputTokens * proportion);
2562
+ } else {
2563
+ proportion = totalStepInputTokens > 0 ? step.usage.inputTokens / totalStepInputTokens : 0;
2564
+ stepPromptTokens = step.usage.inputTokens;
2565
+ stepOutputTokens = Math.round(usage.outputTokens * proportion);
2502
2566
  }
2567
+ const stepTotalTokens = stepPromptTokens + stepOutputTokens;
2568
+ const costProportion = proportion;
2503
2569
  const stepType = step.toolCalls?.length ? import_evalforge_types4.LLMStepType.TOOL_USE : step.thinking && !step.text ? import_evalforge_types4.LLMStepType.THINKING : import_evalforge_types4.LLMStepType.COMPLETION;
2504
2570
  return {
2505
2571
  id: (0, import_crypto.randomUUID)(),
@@ -2509,11 +2575,15 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2509
2575
  provider: "anthropic",
2510
2576
  startedAt: step.startedAt.toISOString(),
2511
2577
  durationMs: step.durationMs,
2512
- tokenUsage,
2513
- costUsd: totalCost * proportion,
2578
+ tokenUsage: {
2579
+ prompt: stepPromptTokens,
2580
+ completion: stepOutputTokens,
2581
+ total: stepTotalTokens
2582
+ },
2583
+ costUsd: totalCost * costProportion,
2514
2584
  toolName: step.toolCalls?.[0]?.toolName,
2515
2585
  toolArguments: step.toolCalls?.[0] ? JSON.stringify(step.toolCalls[0].args) : void 0,
2516
- outputPreview: step.text?.slice(0, 200),
2586
+ outputPreview: (step.text || step.thinking)?.slice(0, 200),
2517
2587
  success: step.finishReason !== "error",
2518
2588
  error: step.finishReason === "error" ? "Generation failed" : void 0
2519
2589
  };
@@ -2523,21 +2593,35 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
2523
2593
  completion: usage.outputTokens,
2524
2594
  total: usage.totalTokens
2525
2595
  };
2526
- const finalCost = totalCost;
2596
+ const stepTypeBreakdown = {};
2597
+ for (const ts of traceSteps) {
2598
+ const entry = stepTypeBreakdown[ts.type] ?? {
2599
+ count: 0,
2600
+ durationMs: 0,
2601
+ tokens: 0,
2602
+ costUsd: 0
2603
+ };
2604
+ entry.count += 1;
2605
+ entry.durationMs += ts.durationMs;
2606
+ entry.tokens += ts.tokenUsage.total;
2607
+ entry.costUsd += ts.costUsd;
2608
+ stepTypeBreakdown[ts.type] = entry;
2609
+ }
2527
2610
  const summary = {
2528
2611
  totalSteps: traceSteps.length,
2529
2612
  totalDurationMs,
2530
2613
  totalTokens: finalTokens,
2531
- totalCostUsd: finalCost,
2614
+ totalCostUsd: totalCost,
2532
2615
  modelBreakdown: {
2533
2616
  [model]: {
2534
2617
  count: traceSteps.length,
2535
2618
  durationMs: totalDurationMs,
2536
2619
  tokens: finalTokens.total,
2537
- costUsd: finalCost
2620
+ costUsd: totalCost
2538
2621
  }
2539
2622
  },
2540
- modelsUsed: [model]
2623
+ modelsUsed: [model],
2624
+ stepTypeBreakdown
2541
2625
  };
2542
2626
  return {
2543
2627
  id: (0, import_crypto.randomUUID)(),
@@ -42745,6 +42829,9 @@ function createModel(modelId, baseUrl, headers) {
42745
42829
  apiKey: "proxy-auth",
42746
42830
  headers
42747
42831
  });
42832
+ if (import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelId)) {
42833
+ return openai2.responses(modelId);
42834
+ }
42748
42835
  return openai2.chat(modelId);
42749
42836
  }
42750
42837
  function isClaudeModelId(modelId) {
@@ -42781,14 +42868,34 @@ async function executeWithAiSdk(context2) {
42781
42868
  emitStartEvent(traceContext, startTime);
42782
42869
  }
42783
42870
  try {
42871
+ const isAnthropic = provider === PROVIDER_ANTHROPIC2;
42872
+ const isResponsesAPI = import_evalforge_types6.OPENAI_RESPONSES_MODEL_IDS.has(modelConfig.model);
42873
+ const supportsThinking = isAnthropic || isResponsesAPI;
42874
+ const providerOpts = {
42875
+ ...isAnthropic && {
42876
+ anthropic: {
42877
+ thinking: { type: "enabled", budgetTokens: 1e4 }
42878
+ }
42879
+ },
42880
+ ...isResponsesAPI && {
42881
+ openai: {
42882
+ // Prevent the SDK from sending item_reference inputs that the proxy can't forward
42883
+ store: false,
42884
+ forceReasoning: true,
42885
+ reasoningEffort: "high",
42886
+ reasoningSummary: "detailed"
42887
+ }
42888
+ }
42889
+ };
42784
42890
  const result = await generateText({
42785
42891
  model,
42786
42892
  system: systemPrompt,
42787
42893
  prompt: scenario.triggerPrompt,
42788
- temperature: modelConfig.temperature,
42894
+ temperature: supportsThinking ? void 0 : modelConfig.temperature,
42789
42895
  maxOutputTokens: modelConfig.maxTokens,
42790
42896
  tools: mcpTools,
42791
- stopWhen: mcpTools ? stepCountIs(MAX_TOOL_STEPS) : stepCountIs(1)
42897
+ stopWhen: mcpTools ? stepCountIs(MAX_TOOL_STEPS) : stepCountIs(1),
42898
+ providerOptions: providerOpts
42792
42899
  });
42793
42900
  const durationMs = Date.now() - startTime;
42794
42901
  const usage = {