@wix/evalforge-evaluator 0.102.0 → 0.103.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +109 -13
- package/build/index.js.map +4 -4
- package/build/index.mjs +109 -13
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/build-conversation.d.ts +8 -0
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +15 -1
- package/build/types/run-scenario/agents/claude-code/index.d.ts +1 -0
- package/build/types/run-scenario/types.d.ts +1 -0
- package/package.json +4 -4
package/build/index.js
CHANGED
|
@@ -879,6 +879,83 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
879
879
|
console.log(`[Rules] Written ${rules.length} rule(s) to ${cwd}`);
|
|
880
880
|
}
|
|
881
881
|
|
|
882
|
+
// src/run-scenario/agents/claude-code/build-conversation.ts
|
|
883
|
+
/**
 * Type guard for assistant messages.
 *
 * Null-safe: a missing (null/undefined) message is reported as "not an
 * assistant message" rather than raising a TypeError on property access.
 *
 * @param {{ type?: string } | null | undefined} message - Message to inspect.
 * @returns {boolean} True only when `message.type` is exactly "assistant".
 */
function isAssistantMessage(message) {
  return message?.type === "assistant";
}
|
|
886
|
+
/**
 * Flattens the payload of a `tool_result` block into a single string.
 *
 * A string payload is returned unchanged. For an array payload, only parts
 * that are objects with `type === "text"` AND a string `text` are kept and
 * joined with newlines; requiring a string `text` keeps malformed parts from
 * contributing blank lines or coerced values. Anything else yields "".
 *
 * @param {unknown} rawContent - The `content` field of a tool_result block.
 * @returns {string} The extracted text.
 */
function toolResultContentToText(rawContent) {
  if (typeof rawContent === "string") {
    return rawContent;
  }
  if (!Array.isArray(rawContent)) {
    return "";
  }
  return rawContent
    .filter(
      (part) =>
        typeof part === "object" &&
        part !== null &&
        part.type === "text" &&
        typeof part.text === "string"
    )
    .map((part) => part.text)
    .join("\n");
}

/**
 * Maps assistant content blocks to conversation content entries.
 * Only "text", "thinking" and "tool_use" blocks are kept; others are dropped.
 *
 * @param {Iterable<object>} blocks - Content blocks of an assistant message.
 * @returns {object[]} Converted content entries, in input order.
 */
function mapAssistantBlocks(blocks) {
  const content = [];
  for (const block of blocks) {
    if (block.type === "text") {
      content.push({ type: "text", text: block.text });
    } else if (block.type === "thinking") {
      content.push({ type: "thinking", thinking: block.thinking });
    } else if (block.type === "tool_use") {
      content.push({
        type: "tool_use",
        toolName: block.name,
        toolId: block.id,
        input: block.input
      });
    }
  }
  return content;
}

/**
 * Maps the content of a user message to conversation content entries.
 * A string becomes a single text entry; an array contributes its
 * "tool_result" blocks and its "text" blocks that carry a string `text`.
 *
 * @param {unknown} msgContent - `message.content` of a user message.
 * @returns {object[]} Converted content entries (possibly empty).
 */
function mapUserContent(msgContent) {
  if (typeof msgContent === "string") {
    return [{ type: "text", text: msgContent }];
  }
  const content = [];
  if (!Array.isArray(msgContent)) {
    return content;
  }
  for (const block of msgContent) {
    // Skip primitives and null entries; only object blocks are meaningful.
    if (typeof block !== "object" || block === null) {
      continue;
    }
    if (block.type === "tool_result") {
      content.push({
        type: "tool_result",
        toolUseId: String(block.tool_use_id ?? ""),
        content: toolResultContentToText(block.content),
        // Only an explicit `true` is recorded; otherwise the field is undefined.
        isError: block.is_error === true ? true : undefined
      });
    } else if (block.type === "text" && typeof block.text === "string") {
      content.push({ type: "text", text: block.text });
    }
  }
  return content;
}

/**
 * Builds a conversation transcript from timestamped messages.
 *
 * - "assistant" messages become `{ role: "assistant", content, timestamp }`.
 * - "user" messages become `{ role: "user", content, timestamp }`.
 * - "system" messages become a single text entry holding the message's
 *   `subtype` (or "system" when the subtype is falsy).
 * - Assistant/user messages with no convertible content are omitted, as are
 *   messages of any other type.
 *
 * @param {Iterable<{ message: object, receivedAt: Date }>} timestampedMessages
 *   Messages paired with the time they were received.
 * @returns {Array<{ role: string, content: object[], timestamp: string }>}
 *   Conversation entries; `timestamp` is `receivedAt.toISOString()`.
 */
function buildConversation(timestampedMessages) {
  const messages = [];
  for (const { message, receivedAt } of timestampedMessages) {
    const timestamp = receivedAt.toISOString();
    if (isAssistantMessage(message)) {
      const content = mapAssistantBlocks(message.message.content);
      if (content.length > 0) {
        messages.push({ role: "assistant", content, timestamp });
      }
    } else if (message.type === "user") {
      const content = mapUserContent(message.message?.content);
      if (content.length > 0) {
        messages.push({ role: "user", content, timestamp });
      }
    } else if (message.type === "system") {
      messages.push({
        role: "system",
        content: [{ type: "text", text: message.subtype || "system" }],
        timestamp
      });
    }
  }
  return messages;
}
|
|
958
|
+
|
|
882
959
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
883
960
|
var DEFAULT_MODEL = import_evalforge_types3.ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
884
961
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
@@ -1009,7 +1086,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1009
1086
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1010
1087
|
isComplete
|
|
1011
1088
|
};
|
|
1012
|
-
if (
|
|
1089
|
+
if (isAssistantMessage2(message)) {
|
|
1013
1090
|
return createTraceEventFromMessage(
|
|
1014
1091
|
message,
|
|
1015
1092
|
context,
|
|
@@ -1025,15 +1102,25 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1025
1102
|
outputPreview = content.slice(0, 500);
|
|
1026
1103
|
} else if (Array.isArray(content)) {
|
|
1027
1104
|
for (const block of content) {
|
|
1028
|
-
if (typeof block
|
|
1029
|
-
|
|
1105
|
+
if (typeof block !== "object" || block === null) continue;
|
|
1106
|
+
const b = block;
|
|
1107
|
+
if (b.type === "text" && typeof b.text === "string") {
|
|
1108
|
+
outputPreview = b.text.slice(0, 500);
|
|
1109
|
+
break;
|
|
1110
|
+
}
|
|
1111
|
+
if (b.type === "tool_result") {
|
|
1112
|
+
const raw = b.content;
|
|
1113
|
+
const preview = typeof raw === "string" ? raw : Array.isArray(raw) ? raw.filter(
|
|
1114
|
+
(c) => c.type === "text" && typeof c.text === "string"
|
|
1115
|
+
).map((c) => c.text).join("\n") : "";
|
|
1116
|
+
outputPreview = preview.slice(0, 500);
|
|
1030
1117
|
break;
|
|
1031
1118
|
}
|
|
1032
1119
|
}
|
|
1033
1120
|
}
|
|
1034
1121
|
return {
|
|
1035
1122
|
...baseEvent,
|
|
1036
|
-
type: import_evalforge_types3.LiveTraceEventType.
|
|
1123
|
+
type: import_evalforge_types3.LiveTraceEventType.TOOL_RESULT,
|
|
1037
1124
|
outputPreview: outputPreview || "(tool result)"
|
|
1038
1125
|
};
|
|
1039
1126
|
}
|
|
@@ -1647,6 +1734,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
1647
1734
|
usage,
|
|
1648
1735
|
options.model || DEFAULT_MODEL
|
|
1649
1736
|
);
|
|
1737
|
+
const conversation = buildConversation(allMessages);
|
|
1650
1738
|
return {
|
|
1651
1739
|
result: {
|
|
1652
1740
|
outputText,
|
|
@@ -1658,7 +1746,8 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
1658
1746
|
},
|
|
1659
1747
|
costUsd: usage.costUsd
|
|
1660
1748
|
},
|
|
1661
|
-
llmTrace
|
|
1749
|
+
llmTrace,
|
|
1750
|
+
conversation
|
|
1662
1751
|
};
|
|
1663
1752
|
}
|
|
1664
1753
|
function buildSdkEnvironment(options) {
|
|
@@ -1675,7 +1764,7 @@ function buildSdkEnvironment(options) {
|
|
|
1675
1764
|
}
|
|
1676
1765
|
return env;
|
|
1677
1766
|
}
|
|
1678
|
-
function
|
|
1767
|
+
/**
 * Type guard for assistant messages.
 *
 * Null-safe: a missing (null/undefined) message is reported as "not an
 * assistant message" rather than raising a TypeError on property access.
 *
 * @param {{ type?: string } | null | undefined} message - Message to inspect.
 * @returns {boolean} True only when `message.type` is exactly "assistant".
 */
function isAssistantMessage2(message) {
  return message?.type === "assistant";
}
|
|
1681
1770
|
function isResultMessage(message) {
|
|
@@ -1686,7 +1775,7 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
1686
1775
|
let result;
|
|
1687
1776
|
const assistantMessageGroups = /* @__PURE__ */ new Map();
|
|
1688
1777
|
for (const { message, receivedAt } of timestampedMessages) {
|
|
1689
|
-
if (
|
|
1778
|
+
if (isAssistantMessage2(message)) {
|
|
1690
1779
|
const uuid = message.uuid;
|
|
1691
1780
|
if (!assistantMessageGroups.has(uuid)) {
|
|
1692
1781
|
assistantMessageGroups.set(uuid, {
|
|
@@ -1715,10 +1804,13 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
1715
1804
|
const inputTokens = usage.input_tokens;
|
|
1716
1805
|
const outputTokens = usage.output_tokens;
|
|
1717
1806
|
let text = "";
|
|
1807
|
+
let thinking = "";
|
|
1718
1808
|
const toolCalls = [];
|
|
1719
1809
|
for (const block of lastMessage.message.content) {
|
|
1720
1810
|
if (block.type === "text") {
|
|
1721
1811
|
text += block.text;
|
|
1812
|
+
} else if (block.type === "thinking") {
|
|
1813
|
+
thinking += block.thinking;
|
|
1722
1814
|
} else if (block.type === "tool_use") {
|
|
1723
1815
|
toolCalls.push({
|
|
1724
1816
|
toolName: block.name,
|
|
@@ -1728,6 +1820,7 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
1728
1820
|
}
|
|
1729
1821
|
steps.push({
|
|
1730
1822
|
text,
|
|
1823
|
+
thinking: thinking || void 0,
|
|
1731
1824
|
usage: {
|
|
1732
1825
|
inputTokens,
|
|
1733
1826
|
outputTokens,
|
|
@@ -1758,7 +1851,7 @@ function mapStopReason(stopReason) {
|
|
|
1758
1851
|
function extractFinalOutput(timestampedMessages) {
|
|
1759
1852
|
for (let i = timestampedMessages.length - 1; i >= 0; i--) {
|
|
1760
1853
|
const { message } = timestampedMessages[i];
|
|
1761
|
-
if (
|
|
1854
|
+
if (isAssistantMessage2(message)) {
|
|
1762
1855
|
for (const block of message.message.content) {
|
|
1763
1856
|
if (block.type === "text" && block.text) {
|
|
1764
1857
|
return block.text;
|
|
@@ -1788,10 +1881,11 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1788
1881
|
);
|
|
1789
1882
|
const traceSteps = steps.map((step, index) => {
|
|
1790
1883
|
const proportion = totalStepTokens > 0 ? step.usage.totalTokens / totalStepTokens : 0;
|
|
1884
|
+
const stepType = step.toolCalls?.length ? import_evalforge_types3.LLMStepType.TOOL_USE : step.thinking && !step.text ? import_evalforge_types3.LLMStepType.THINKING : import_evalforge_types3.LLMStepType.COMPLETION;
|
|
1791
1885
|
return {
|
|
1792
1886
|
id: (0, import_crypto.randomUUID)(),
|
|
1793
1887
|
stepNumber: index + 1,
|
|
1794
|
-
type:
|
|
1888
|
+
type: stepType,
|
|
1795
1889
|
model,
|
|
1796
1890
|
provider: "anthropic",
|
|
1797
1891
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -1881,7 +1975,7 @@ var ClaudeCodeAdapter = class {
|
|
|
1881
1975
|
rules,
|
|
1882
1976
|
systemPrompt
|
|
1883
1977
|
};
|
|
1884
|
-
const { result, llmTrace } = await executeWithClaudeCode(
|
|
1978
|
+
const { result, llmTrace, conversation } = await executeWithClaudeCode(
|
|
1885
1979
|
skills,
|
|
1886
1980
|
scenario,
|
|
1887
1981
|
options
|
|
@@ -1895,7 +1989,8 @@ var ClaudeCodeAdapter = class {
|
|
|
1895
1989
|
totalTokens: result.usage.totalTokens
|
|
1896
1990
|
},
|
|
1897
1991
|
costUsd: result.costUsd,
|
|
1898
|
-
llmTrace
|
|
1992
|
+
llmTrace,
|
|
1993
|
+
conversation
|
|
1899
1994
|
};
|
|
1900
1995
|
}
|
|
1901
1996
|
};
|
|
@@ -2668,7 +2763,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2668
2763
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
2669
2764
|
systemPrompt: agent?.systemPrompt
|
|
2670
2765
|
};
|
|
2671
|
-
const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
|
|
2766
|
+
const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
|
|
2672
2767
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2673
2768
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
2674
2769
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
@@ -2686,7 +2781,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2686
2781
|
templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
|
|
2687
2782
|
startedAt,
|
|
2688
2783
|
completedAt,
|
|
2689
|
-
llmTrace
|
|
2784
|
+
llmTrace,
|
|
2785
|
+
conversation
|
|
2690
2786
|
};
|
|
2691
2787
|
}
|
|
2692
2788
|
|