@wix/evalforge-evaluator 0.101.0 → 0.103.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -870,6 +870,83 @@ async function writeRulesToFilesystem(cwd, rules) {
870
870
  console.log(`[Rules] Written ${rules.length} rule(s) to ${cwd}`);
871
871
  }
872
872
 
873
+ // src/run-scenario/agents/claude-code/build-conversation.ts
874
/**
 * Type guard: reports whether an SDK message is an assistant message.
 *
 * Uses optional chaining so that `null`/`undefined` inputs yield `false`
 * instead of throwing a TypeError on the property access.
 *
 * @param {unknown} message - Candidate message object.
 * @returns {boolean} `true` only when `message.type === "assistant"`.
 */
function isAssistantMessage(message) {
  return message?.type === "assistant";
}
877
/**
 * Builds a flat conversation transcript from the timestamped message stream.
 *
 * Each input entry is `{ message, receivedAt }`. Entries are mapped by
 * `message.type`:
 *   - "assistant": text / thinking / tool_use blocks are kept; other block
 *     types are ignored.
 *   - "user": string content becomes a single text block; array content
 *     keeps tool_result and text blocks.
 *   - "system": a single text block holding `subtype` (or "system").
 *   - anything else is skipped.
 * Assistant and user messages that end up with no content are omitted.
 *
 * @param {Iterable<{message: object, receivedAt: Date}>} timestampedMessages
 *   Messages paired with the time they were received; `receivedAt` must
 *   support `toISOString()`.
 * @returns {Array<{role: string, content: object[], timestamp: string}>}
 *   Conversation messages in input order.
 */
function buildConversation(timestampedMessages) {
  // Flattens a tool_result payload (string, or array of blocks) to plain text.
  // Only blocks whose `text` is really a string are joined, so malformed
  // blocks cannot inject blank lines or "[object Object]" into the output.
  const flattenToolResult = (rawContent) => {
    if (typeof rawContent === "string") {
      return rawContent;
    }
    if (!Array.isArray(rawContent)) {
      return "";
    }
    return rawContent
      .filter(
        (c) =>
          typeof c === "object" &&
          c !== null &&
          c.type === "text" &&
          typeof c.text === "string"
      )
      .map((c) => c.text)
      .join("\n");
  };

  const messages = [];
  for (const { message, receivedAt } of timestampedMessages) {
    // A missing message has no type to dispatch on; skip it rather than throw.
    if (typeof message !== "object" || message === null) {
      continue;
    }
    const timestamp = receivedAt.toISOString();

    if (isAssistantMessage(message)) {
      const content = [];
      // Same defensive access as the user branch below.
      const blocks = message.message?.content;
      if (typeof blocks === "string") {
        content.push({ type: "text", text: blocks });
      } else if (Array.isArray(blocks)) {
        for (const block of blocks) {
          if (typeof block !== "object" || block === null) {
            continue;
          }
          if (block.type === "text") {
            content.push({ type: "text", text: block.text });
          } else if (block.type === "thinking") {
            content.push({ type: "thinking", thinking: block.thinking });
          } else if (block.type === "tool_use") {
            content.push({
              type: "tool_use",
              toolName: block.name,
              toolId: block.id,
              input: block.input
            });
          }
        }
      }
      if (content.length > 0) {
        messages.push({ role: "assistant", content, timestamp });
      }
    } else if (message.type === "user") {
      const content = [];
      const msgContent = message.message?.content;
      if (typeof msgContent === "string") {
        content.push({ type: "text", text: msgContent });
      } else if (Array.isArray(msgContent)) {
        for (const block of msgContent) {
          if (typeof block !== "object" || block === null) {
            continue;
          }
          if (block.type === "tool_result") {
            content.push({
              type: "tool_result",
              toolUseId: String(block.tool_use_id ?? ""),
              content: flattenToolResult(block.content),
              // Only an explicit `true` is recorded; everything else is left undefined.
              isError: block.is_error === true ? true : void 0
            });
          } else if (block.type === "text" && typeof block.text === "string") {
            content.push({ type: "text", text: block.text });
          }
        }
      }
      if (content.length > 0) {
        messages.push({ role: "user", content, timestamp });
      }
    } else if (message.type === "system") {
      messages.push({
        role: "system",
        content: [{ type: "text", text: message.subtype || "system" }],
        timestamp
      });
    }
  }
  return messages;
}
949
+
873
950
  // src/run-scenario/agents/claude-code/execute.ts
874
951
  var DEFAULT_MODEL = ClaudeModel.CLAUDE_4_5_SONNET_1_0;
875
952
  function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
@@ -1000,7 +1077,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1000
1077
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
1001
1078
  isComplete
1002
1079
  };
1003
- if (isAssistantMessage(message)) {
1080
+ if (isAssistantMessage2(message)) {
1004
1081
  return createTraceEventFromMessage(
1005
1082
  message,
1006
1083
  context,
@@ -1016,15 +1093,25 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
1016
1093
  outputPreview = content.slice(0, 500);
1017
1094
  } else if (Array.isArray(content)) {
1018
1095
  for (const block of content) {
1019
- if (typeof block === "object" && "text" in block && block.text) {
1020
- outputPreview = String(block.text).slice(0, 500);
1096
+ if (typeof block !== "object" || block === null) continue;
1097
+ const b = block;
1098
+ if (b.type === "text" && typeof b.text === "string") {
1099
+ outputPreview = b.text.slice(0, 500);
1100
+ break;
1101
+ }
1102
+ if (b.type === "tool_result") {
1103
+ const raw = b.content;
1104
+ const preview = typeof raw === "string" ? raw : Array.isArray(raw) ? raw.filter(
1105
+ (c) => c.type === "text" && typeof c.text === "string"
1106
+ ).map((c) => c.text).join("\n") : "";
1107
+ outputPreview = preview.slice(0, 500);
1021
1108
  break;
1022
1109
  }
1023
1110
  }
1024
1111
  }
1025
1112
  return {
1026
1113
  ...baseEvent,
1027
- type: LiveTraceEventType.USER,
1114
+ type: LiveTraceEventType.TOOL_RESULT,
1028
1115
  outputPreview: outputPreview || "(tool result)"
1029
1116
  };
1030
1117
  }
@@ -1638,6 +1725,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
1638
1725
  usage,
1639
1726
  options.model || DEFAULT_MODEL
1640
1727
  );
1728
+ const conversation = buildConversation(allMessages);
1641
1729
  return {
1642
1730
  result: {
1643
1731
  outputText,
@@ -1649,7 +1737,8 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
1649
1737
  },
1650
1738
  costUsd: usage.costUsd
1651
1739
  },
1652
- llmTrace
1740
+ llmTrace,
1741
+ conversation
1653
1742
  };
1654
1743
  }
1655
1744
  function buildSdkEnvironment(options) {
@@ -1666,7 +1755,7 @@ function buildSdkEnvironment(options) {
1666
1755
  }
1667
1756
  return env;
1668
1757
  }
1669
- function isAssistantMessage(message) {
1758
/**
 * Type guard: reports whether an SDK message is an assistant message.
 *
 * Uses optional chaining so that `null`/`undefined` inputs yield `false`
 * instead of throwing a TypeError on the property access.
 *
 * @param {unknown} message - Candidate message object.
 * @returns {boolean} `true` only when `message.type === "assistant"`.
 */
function isAssistantMessage2(message) {
  return message?.type === "assistant";
}
1672
1761
  function isResultMessage(message) {
@@ -1677,7 +1766,7 @@ function processMessages(timestampedMessages, startTime, endTime) {
1677
1766
  let result;
1678
1767
  const assistantMessageGroups = /* @__PURE__ */ new Map();
1679
1768
  for (const { message, receivedAt } of timestampedMessages) {
1680
- if (isAssistantMessage(message)) {
1769
+ if (isAssistantMessage2(message)) {
1681
1770
  const uuid = message.uuid;
1682
1771
  if (!assistantMessageGroups.has(uuid)) {
1683
1772
  assistantMessageGroups.set(uuid, {
@@ -1706,10 +1795,13 @@ function processMessages(timestampedMessages, startTime, endTime) {
1706
1795
  const inputTokens = usage.input_tokens;
1707
1796
  const outputTokens = usage.output_tokens;
1708
1797
  let text = "";
1798
+ let thinking = "";
1709
1799
  const toolCalls = [];
1710
1800
  for (const block of lastMessage.message.content) {
1711
1801
  if (block.type === "text") {
1712
1802
  text += block.text;
1803
+ } else if (block.type === "thinking") {
1804
+ thinking += block.thinking;
1713
1805
  } else if (block.type === "tool_use") {
1714
1806
  toolCalls.push({
1715
1807
  toolName: block.name,
@@ -1719,6 +1811,7 @@ function processMessages(timestampedMessages, startTime, endTime) {
1719
1811
  }
1720
1812
  steps.push({
1721
1813
  text,
1814
+ thinking: thinking || void 0,
1722
1815
  usage: {
1723
1816
  inputTokens,
1724
1817
  outputTokens,
@@ -1749,7 +1842,7 @@ function mapStopReason(stopReason) {
1749
1842
  function extractFinalOutput(timestampedMessages) {
1750
1843
  for (let i = timestampedMessages.length - 1; i >= 0; i--) {
1751
1844
  const { message } = timestampedMessages[i];
1752
- if (isAssistantMessage(message)) {
1845
+ if (isAssistantMessage2(message)) {
1753
1846
  for (const block of message.message.content) {
1754
1847
  if (block.type === "text" && block.text) {
1755
1848
  return block.text;
@@ -1779,10 +1872,11 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
1779
1872
  );
1780
1873
  const traceSteps = steps.map((step, index) => {
1781
1874
  const proportion = totalStepTokens > 0 ? step.usage.totalTokens / totalStepTokens : 0;
1875
+ const stepType = step.toolCalls?.length ? LLMStepType.TOOL_USE : step.thinking && !step.text ? LLMStepType.THINKING : LLMStepType.COMPLETION;
1782
1876
  return {
1783
1877
  id: randomUUID(),
1784
1878
  stepNumber: index + 1,
1785
- type: step.toolCalls?.length ? LLMStepType.TOOL_USE : LLMStepType.COMPLETION,
1879
+ type: stepType,
1786
1880
  model,
1787
1881
  provider: "anthropic",
1788
1882
  startedAt: step.startedAt.toISOString(),
@@ -1872,7 +1966,7 @@ var ClaudeCodeAdapter = class {
1872
1966
  rules,
1873
1967
  systemPrompt
1874
1968
  };
1875
- const { result, llmTrace } = await executeWithClaudeCode(
1969
+ const { result, llmTrace, conversation } = await executeWithClaudeCode(
1876
1970
  skills,
1877
1971
  scenario,
1878
1972
  options
@@ -1886,7 +1980,8 @@ var ClaudeCodeAdapter = class {
1886
1980
  totalTokens: result.usage.totalTokens
1887
1981
  },
1888
1982
  costUsd: result.costUsd,
1889
- llmTrace
1983
+ llmTrace,
1984
+ conversation
1890
1985
  };
1891
1986
  }
1892
1987
  };
@@ -2659,7 +2754,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
2659
2754
  rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
2660
2755
  systemPrompt: agent?.systemPrompt
2661
2756
  };
2662
- const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
2757
+ const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
2663
2758
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
2664
2759
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
2665
2760
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
@@ -2677,7 +2772,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
2677
2772
  templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
2678
2773
  startedAt,
2679
2774
  completedAt,
2680
- llmTrace
2775
+ llmTrace,
2776
+ conversation
2681
2777
  };
2682
2778
  }
2683
2779