@wix/evalforge-evaluator 0.101.0 → 0.103.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -879,6 +879,83 @@ async function writeRulesToFilesystem(cwd, rules) {
879
879
  console.log(`[Rules] Written ${rules.length} rule(s) to ${cwd}`);
880
880
  }
881
881
 
882
+ // src/run-scenario/agents/claude-code/build-conversation.ts
883
/**
 * Type-tag predicate: reports whether `message` is marked as an assistant
 * message, i.e. its `type` property is exactly the string "assistant".
 *
 * @param {{ type?: unknown }} message - Message object to inspect.
 * @returns {boolean} `true` only when `message.type === "assistant"`.
 */
function isAssistantMessage(message) {
  const { type } = message;
  return type === "assistant";
}
886
/**
 * Flattens the `content` of a `tool_result` block into plain text.
 *
 * @param {unknown} rawContent - Either a string or an array of content parts.
 * @returns {string} The string itself, the "\n"-joined `text` of every part
 *   shaped like `{ type: "text", text: string }`, or "" for anything else.
 */
function conversationToolResultText(rawContent) {
  if (typeof rawContent === "string") {
    return rawContent;
  }
  if (!Array.isArray(rawContent)) {
    return "";
  }
  return rawContent
    .filter(
      (part) =>
        typeof part === "object" &&
        part !== null &&
        part.type === "text" &&
        // Non-string `text` would otherwise be joined as "" or "[object Object]".
        typeof part.text === "string"
    )
    .map((part) => part.text)
    .join("\n");
}

/**
 * Converts the content blocks of an assistant message into conversation parts.
 * Only `text`, `thinking` and `tool_use` blocks are kept; others are dropped.
 *
 * @param {unknown} blocks - Expected to be an array of content blocks.
 * @returns {Array<object>} Converted parts; empty when `blocks` is not an array.
 */
function conversationAssistantContent(blocks) {
  const content = [];
  if (!Array.isArray(blocks)) {
    // A malformed assistant message must not abort the whole conversation.
    return content;
  }
  for (const block of blocks) {
    if (typeof block !== "object" || block === null) {
      continue;
    }
    if (block.type === "text") {
      content.push({ type: "text", text: block.text });
    } else if (block.type === "thinking") {
      content.push({ type: "thinking", thinking: block.thinking });
    } else if (block.type === "tool_use") {
      content.push({
        type: "tool_use",
        toolName: block.name,
        toolId: block.id,
        input: block.input
      });
    }
  }
  return content;
}

/**
 * Converts the content of a user message into conversation parts.
 * A plain string becomes a single text part; in an array, `tool_result`
 * blocks and string-valued `text` blocks are kept and the rest are dropped.
 *
 * @param {unknown} msgContent - A string or an array of content blocks.
 * @returns {Array<object>} Converted parts; empty for any other input.
 */
function conversationUserContent(msgContent) {
  if (typeof msgContent === "string") {
    return [{ type: "text", text: msgContent }];
  }
  const content = [];
  if (!Array.isArray(msgContent)) {
    return content;
  }
  for (const block of msgContent) {
    if (typeof block !== "object" || block === null) {
      continue;
    }
    if (block.type === "tool_result") {
      content.push({
        type: "tool_result",
        toolUseId: String(block.tool_use_id ?? ""),
        content: conversationToolResultText(block.content),
        // Only an explicit `true` is surfaced; anything else stays undefined.
        isError: block.is_error === true ? true : undefined
      });
    } else if (block.type === "text" && typeof block.text === "string") {
      content.push({ type: "text", text: block.text });
    }
  }
  return content;
}

/**
 * Builds a flat, timestamped conversation from the collected messages.
 *
 * Each input entry is `{ message, receivedAt }`; `receivedAt` must expose
 * `toISOString()` (presumably a `Date` — confirm against the caller).
 * Messages are handled by `message.type`:
 *   - "assistant": text / thinking / tool_use blocks of `message.message.content`
 *   - "user": string content, or tool_result / text blocks
 *   - "system": one text part holding `subtype`, or "system" when it is falsy
 * Other message types are ignored, and assistant/user messages that yield no
 * content parts are omitted.
 *
 * @param {Iterable<{ message: object, receivedAt: { toISOString(): string } }>} timestampedMessages
 * @returns {Array<{ role: string, content: Array<object>, timestamp: string }>}
 */
function buildConversation(timestampedMessages) {
  const messages = [];
  for (const { message, receivedAt } of timestampedMessages) {
    const timestamp = receivedAt.toISOString();
    if (isAssistantMessage(message)) {
      const content = conversationAssistantContent(message.message?.content);
      if (content.length > 0) {
        messages.push({ role: "assistant", content, timestamp });
      }
    } else if (message.type === "user") {
      const content = conversationUserContent(message.message?.content);
      if (content.length > 0) {
        messages.push({ role: "user", content, timestamp });
      }
    } else if (message.type === "system") {
      messages.push({
        role: "system",
        content: [{ type: "text", text: message.subtype || "system" }],
        timestamp
      });
    }
  }
  return messages;
}
958
+
882
959
  // src/run-scenario/agents/claude-code/execute.ts
883
960
  var DEFAULT_MODEL = import_evalforge_types3.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
884
961
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
@@ -1009,7 +1086,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1009
1086
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1010
1087
  isComplete
1011
1088
  };
1012
- if (isAssistantMessage(message)) {
1089
+ if (isAssistantMessage2(message)) {
1013
1090
  return createTraceEventFromMessage(
1014
1091
  message,
1015
1092
  context,
@@ -1025,15 +1102,25 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1025
1102
  outputPreview = content.slice(0, 500);
1026
1103
  } else if (Array.isArray(content)) {
1027
1104
  for (const block of content) {
1028
- if (typeof block === "object" && "text" in block && block.text) {
1029
- outputPreview = String(block.text).slice(0, 500);
1105
+ if (typeof block !== "object" || block === null) continue;
1106
+ const b = block;
1107
+ if (b.type === "text" && typeof b.text === "string") {
1108
+ outputPreview = b.text.slice(0, 500);
1109
+ break;
1110
+ }
1111
+ if (b.type === "tool_result") {
1112
+ const raw = b.content;
1113
+ const preview = typeof raw === "string" ? raw : Array.isArray(raw) ? raw.filter(
1114
+ (c) => c.type === "text" && typeof c.text === "string"
1115
+ ).map((c) => c.text).join("\n") : "";
1116
+ outputPreview = preview.slice(0, 500);
1030
1117
  break;
1031
1118
  }
1032
1119
  }
1033
1120
  }
1034
1121
  return {
1035
1122
  ...baseEvent,
1036
- type: import_evalforge_types3.LiveTraceEventType.USER,
1123
+ type: import_evalforge_types3.LiveTraceEventType.TOOL_RESULT,
1037
1124
  outputPreview: outputPreview || "(tool result)"
1038
1125
  };
1039
1126
  }
@@ -1647,6 +1734,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
1647
1734
  usage,
1648
1735
  options.model || DEFAULT_MODEL
1649
1736
  );
1737
+ const conversation = buildConversation(allMessages);
1650
1738
  return {
1651
1739
  result: {
1652
1740
  outputText,
@@ -1658,7 +1746,8 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
1658
1746
  },
1659
1747
  costUsd: usage.costUsd
1660
1748
  },
1661
- llmTrace
1749
+ llmTrace,
1750
+ conversation
1662
1751
  };
1663
1752
  }
1664
1753
  function buildSdkEnvironment(options) {
@@ -1675,7 +1764,7 @@ function buildSdkEnvironment(options) {
1675
1764
  }
1676
1765
  return env;
1677
1766
  }
1678
- function isAssistantMessage(message) {
1767
+ function isAssistantMessage2(message) {
1679
1768
  return message.type === "assistant";
1680
1769
  }
1681
1770
  function isResultMessage(message) {
@@ -1686,7 +1775,7 @@ function processMessages(timestampedMessages, startTime, endTime) {
1686
1775
  let result;
1687
1776
  const assistantMessageGroups = /* @__PURE__ */ new Map();
1688
1777
  for (const { message, receivedAt } of timestampedMessages) {
1689
- if (isAssistantMessage(message)) {
1778
+ if (isAssistantMessage2(message)) {
1690
1779
  const uuid = message.uuid;
1691
1780
  if (!assistantMessageGroups.has(uuid)) {
1692
1781
  assistantMessageGroups.set(uuid, {
@@ -1715,10 +1804,13 @@ function processMessages(timestampedMessages, startTime, endTime) {
1715
1804
  const inputTokens = usage.input_tokens;
1716
1805
  const outputTokens = usage.output_tokens;
1717
1806
  let text = "";
1807
+ let thinking = "";
1718
1808
  const toolCalls = [];
1719
1809
  for (const block of lastMessage.message.content) {
1720
1810
  if (block.type === "text") {
1721
1811
  text += block.text;
1812
+ } else if (block.type === "thinking") {
1813
+ thinking += block.thinking;
1722
1814
  } else if (block.type === "tool_use") {
1723
1815
  toolCalls.push({
1724
1816
  toolName: block.name,
@@ -1728,6 +1820,7 @@ function processMessages(timestampedMessages, startTime, endTime) {
1728
1820
  }
1729
1821
  steps.push({
1730
1822
  text,
1823
+ thinking: thinking || void 0,
1731
1824
  usage: {
1732
1825
  inputTokens,
1733
1826
  outputTokens,
@@ -1758,7 +1851,7 @@ function mapStopReason(stopReason) {
1758
1851
  function extractFinalOutput(timestampedMessages) {
1759
1852
  for (let i = timestampedMessages.length - 1; i >= 0; i--) {
1760
1853
  const { message } = timestampedMessages[i];
1761
- if (isAssistantMessage(message)) {
1854
+ if (isAssistantMessage2(message)) {
1762
1855
  for (const block of message.message.content) {
1763
1856
  if (block.type === "text" && block.text) {
1764
1857
  return block.text;
@@ -1788,10 +1881,11 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1788
1881
  );
1789
1882
  const traceSteps = steps.map((step, index) => {
1790
1883
  const proportion = totalStepTokens > 0 ? step.usage.totalTokens / totalStepTokens : 0;
1884
+ const stepType = step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : step.thinking && !step.text ? import_evalforge_types3.LLMStepType.THINKING : import_evalforge_types3.LLMStepType.COMPLETION;
1791
1885
  return {
1792
1886
  id: (0, import_crypto.randomUUID)(),
1793
1887
  stepNumber: index + 1,
1794
- type: step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : import_evalforge_types3.LLMStepType.COMPLETION,
1888
+ type: stepType,
1795
1889
  model,
1796
1890
  provider: "anthropic",
1797
1891
  startedAt: step.startedAt.toISOString(),
@@ -1881,7 +1975,7 @@ var ClaudeCodeAdapter = class {
1881
1975
  rules,
1882
1976
  systemPrompt
1883
1977
  };
1884
- const { result, llmTrace } = await executeWithClaudeCode(
1978
+ const { result, llmTrace, conversation } = await executeWithClaudeCode(
1885
1979
  skills,
1886
1980
  scenario,
1887
1981
  options
@@ -1895,7 +1989,8 @@ var ClaudeCodeAdapter = class {
1895
1989
  totalTokens: result.usage.totalTokens
1896
1990
  },
1897
1991
  costUsd: result.costUsd,
1898
- llmTrace
1992
+ llmTrace,
1993
+ conversation
1899
1994
  };
1900
1995
  }
1901
1996
  };
@@ -2668,7 +2763,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
2668
2763
  rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
2669
2764
  systemPrompt: agent?.systemPrompt
2670
2765
  };
2671
- const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
2766
+ const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
2672
2767
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
2673
2768
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
2674
2769
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
@@ -2686,7 +2781,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
2686
2781
  templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
2687
2782
  startedAt,
2688
2783
  completedAt,
2689
- llmTrace
2784
+ llmTrace,
2785
+ conversation
2690
2786
  };
2691
2787
  }
2692
2788