@wix/evalforge-evaluator 0.101.0 → 0.103.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +109 -13
- package/build/index.js.map +4 -4
- package/build/index.mjs +109 -13
- package/build/index.mjs.map +4 -4
- package/build/types/run-scenario/agents/claude-code/build-conversation.d.ts +8 -0
- package/build/types/run-scenario/agents/claude-code/execute.d.ts +15 -1
- package/build/types/run-scenario/agents/claude-code/index.d.ts +1 -0
- package/build/types/run-scenario/types.d.ts +1 -0
- package/package.json +4 -4
package/build/index.mjs
CHANGED
|
@@ -870,6 +870,83 @@ async function writeRulesToFilesystem(cwd, rules) {
|
|
|
870
870
|
console.log(`[Rules] Written ${rules.length} rule(s) to ${cwd}`);
|
|
871
871
|
}
|
|
872
872
|
|
|
873
|
+
// src/run-scenario/agents/claude-code/build-conversation.ts
|
|
874
|
+
/**
 * Type predicate for the conversation builder: reports whether an SDK
 * message is tagged as an assistant message.
 *
 * @param {{ type?: unknown }} message - SDK message to classify.
 * @returns {boolean} `true` only when `message.type` is exactly "assistant".
 */
function isAssistantMessage(message) {
  const { type: messageType } = message;
  return messageType === "assistant";
}
|
|
877
|
+
/**
 * Converts the timestamped SDK message stream into a flat conversation
 * transcript.
 *
 * Handling per message type:
 * - "assistant": `text`, `thinking` and `tool_use` blocks are copied; any
 *   other block type is dropped.
 * - "user": string content becomes one text block; array content yields
 *   `tool_result` blocks (text parts joined with "\n") and `text` blocks.
 * - "system": a single text block holding `subtype`, or "system" when the
 *   subtype is empty or missing.
 * - any other type is skipped.
 *
 * Assistant and user messages that end up with no content blocks are omitted.
 *
 * @param {Iterable<{ message: object, receivedAt: { toISOString(): string } }>} timestampedMessages
 *   Messages paired with the moment they were received.
 * @returns {Array<{ role: string, content: object[], timestamp: string }>}
 *   Conversation entries in input order.
 */
function buildConversation(timestampedMessages) {
  const messages = [];
  for (const { message, receivedAt } of timestampedMessages) {
    const timestamp = receivedAt.toISOString();

    if (isAssistantMessage(message)) {
      const content = [];
      // Guard the payload the same way the user branch does: a missing or
      // non-array `content` yields no blocks instead of throwing.
      const blocks = message.message?.content;
      if (Array.isArray(blocks)) {
        for (const block of blocks) {
          if (typeof block !== "object" || block === null) continue;
          if (block.type === "text") {
            content.push({ type: "text", text: block.text });
          } else if (block.type === "thinking") {
            content.push({ type: "thinking", thinking: block.thinking });
          } else if (block.type === "tool_use") {
            content.push({
              type: "tool_use",
              toolName: block.name,
              toolId: block.id,
              input: block.input
            });
          }
        }
      }
      if (content.length > 0) {
        messages.push({ role: "assistant", content, timestamp });
      }
    } else if (message.type === "user") {
      const content = [];
      const msgContent = message.message?.content;
      if (typeof msgContent === "string") {
        content.push({ type: "text", text: msgContent });
      } else if (Array.isArray(msgContent)) {
        for (const block of msgContent) {
          if (typeof block !== "object" || block === null) continue;
          if (block.type === "tool_result") {
            const rawContent = block.content;
            let text = "";
            if (typeof rawContent === "string") {
              text = rawContent;
            } else if (Array.isArray(rawContent)) {
              // Keep only text parts that actually carry a string, so a
              // malformed part cannot inject blank lines or stringified
              // non-string values into the joined result.
              text = rawContent
                .filter(
                  (part) =>
                    typeof part === "object" &&
                    part !== null &&
                    part.type === "text" &&
                    typeof part.text === "string"
                )
                .map((part) => part.text)
                .join("\n");
            }
            content.push({
              type: "tool_result",
              toolUseId: String(block.tool_use_id ?? ""),
              content: text,
              // Only an explicit `true` marks an error; everything else is left unset.
              isError: block.is_error === true ? true : void 0
            });
          } else if (block.type === "text" && typeof block.text === "string") {
            content.push({ type: "text", text: block.text });
          }
        }
      }
      if (content.length > 0) {
        messages.push({ role: "user", content, timestamp });
      }
    } else if (message.type === "system") {
      messages.push({
        role: "system",
        content: [{ type: "text", text: message.subtype || "system" }],
        timestamp
      });
    }
  }
  return messages;
}
|
|
949
|
+
|
|
873
950
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
874
951
|
var DEFAULT_MODEL = ClaudeModel.CLAUDE_4_5_SONNET_1_0;
|
|
875
952
|
function emitTraceEvent(event, tracePushUrl, routeHeader, authToken) {
|
|
@@ -1000,7 +1077,7 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1000
1077
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1001
1078
|
isComplete
|
|
1002
1079
|
};
|
|
1003
|
-
if (
|
|
1080
|
+
if (isAssistantMessage2(message)) {
|
|
1004
1081
|
return createTraceEventFromMessage(
|
|
1005
1082
|
message,
|
|
1006
1083
|
context,
|
|
@@ -1016,15 +1093,25 @@ function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete
|
|
|
1016
1093
|
outputPreview = content.slice(0, 500);
|
|
1017
1094
|
} else if (Array.isArray(content)) {
|
|
1018
1095
|
for (const block of content) {
|
|
1019
|
-
if (typeof block
|
|
1020
|
-
|
|
1096
|
+
if (typeof block !== "object" || block === null) continue;
|
|
1097
|
+
const b = block;
|
|
1098
|
+
if (b.type === "text" && typeof b.text === "string") {
|
|
1099
|
+
outputPreview = b.text.slice(0, 500);
|
|
1100
|
+
break;
|
|
1101
|
+
}
|
|
1102
|
+
if (b.type === "tool_result") {
|
|
1103
|
+
const raw = b.content;
|
|
1104
|
+
const preview = typeof raw === "string" ? raw : Array.isArray(raw) ? raw.filter(
|
|
1105
|
+
(c) => c.type === "text" && typeof c.text === "string"
|
|
1106
|
+
).map((c) => c.text).join("\n") : "";
|
|
1107
|
+
outputPreview = preview.slice(0, 500);
|
|
1021
1108
|
break;
|
|
1022
1109
|
}
|
|
1023
1110
|
}
|
|
1024
1111
|
}
|
|
1025
1112
|
return {
|
|
1026
1113
|
...baseEvent,
|
|
1027
|
-
type: LiveTraceEventType.
|
|
1114
|
+
type: LiveTraceEventType.TOOL_RESULT,
|
|
1028
1115
|
outputPreview: outputPreview || "(tool result)"
|
|
1029
1116
|
};
|
|
1030
1117
|
}
|
|
@@ -1638,6 +1725,7 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
1638
1725
|
usage,
|
|
1639
1726
|
options.model || DEFAULT_MODEL
|
|
1640
1727
|
);
|
|
1728
|
+
const conversation = buildConversation(allMessages);
|
|
1641
1729
|
return {
|
|
1642
1730
|
result: {
|
|
1643
1731
|
outputText,
|
|
@@ -1649,7 +1737,8 @@ Stack: ${errorStack.split("\n").slice(0, 5).join("\n")}` : "")
|
|
|
1649
1737
|
},
|
|
1650
1738
|
costUsd: usage.costUsd
|
|
1651
1739
|
},
|
|
1652
|
-
llmTrace
|
|
1740
|
+
llmTrace,
|
|
1741
|
+
conversation
|
|
1653
1742
|
};
|
|
1654
1743
|
}
|
|
1655
1744
|
function buildSdkEnvironment(options) {
|
|
@@ -1666,7 +1755,7 @@ function buildSdkEnvironment(options) {
|
|
|
1666
1755
|
}
|
|
1667
1756
|
return env;
|
|
1668
1757
|
}
|
|
1669
|
-
function
|
|
1758
|
+
/**
 * Type predicate used by the execution and trace code: reports whether an
 * SDK message is tagged as an assistant message.
 *
 * @param {{ type?: unknown }} message - SDK message to classify.
 * @returns {boolean} `true` only when `message.type` is exactly "assistant".
 */
function isAssistantMessage2(message) {
  const { type: messageType } = message;
  return messageType === "assistant";
}
|
|
1672
1761
|
function isResultMessage(message) {
|
|
@@ -1677,7 +1766,7 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
1677
1766
|
let result;
|
|
1678
1767
|
const assistantMessageGroups = /* @__PURE__ */ new Map();
|
|
1679
1768
|
for (const { message, receivedAt } of timestampedMessages) {
|
|
1680
|
-
if (
|
|
1769
|
+
if (isAssistantMessage2(message)) {
|
|
1681
1770
|
const uuid = message.uuid;
|
|
1682
1771
|
if (!assistantMessageGroups.has(uuid)) {
|
|
1683
1772
|
assistantMessageGroups.set(uuid, {
|
|
@@ -1706,10 +1795,13 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
1706
1795
|
const inputTokens = usage.input_tokens;
|
|
1707
1796
|
const outputTokens = usage.output_tokens;
|
|
1708
1797
|
let text = "";
|
|
1798
|
+
let thinking = "";
|
|
1709
1799
|
const toolCalls = [];
|
|
1710
1800
|
for (const block of lastMessage.message.content) {
|
|
1711
1801
|
if (block.type === "text") {
|
|
1712
1802
|
text += block.text;
|
|
1803
|
+
} else if (block.type === "thinking") {
|
|
1804
|
+
thinking += block.thinking;
|
|
1713
1805
|
} else if (block.type === "tool_use") {
|
|
1714
1806
|
toolCalls.push({
|
|
1715
1807
|
toolName: block.name,
|
|
@@ -1719,6 +1811,7 @@ function processMessages(timestampedMessages, startTime, endTime) {
|
|
|
1719
1811
|
}
|
|
1720
1812
|
steps.push({
|
|
1721
1813
|
text,
|
|
1814
|
+
thinking: thinking || void 0,
|
|
1722
1815
|
usage: {
|
|
1723
1816
|
inputTokens,
|
|
1724
1817
|
outputTokens,
|
|
@@ -1749,7 +1842,7 @@ function mapStopReason(stopReason) {
|
|
|
1749
1842
|
function extractFinalOutput(timestampedMessages) {
|
|
1750
1843
|
for (let i = timestampedMessages.length - 1; i >= 0; i--) {
|
|
1751
1844
|
const { message } = timestampedMessages[i];
|
|
1752
|
-
if (
|
|
1845
|
+
if (isAssistantMessage2(message)) {
|
|
1753
1846
|
for (const block of message.message.content) {
|
|
1754
1847
|
if (block.type === "text" && block.text) {
|
|
1755
1848
|
return block.text;
|
|
@@ -1779,10 +1872,11 @@ function buildLLMTraceFromSteps(steps, totalDurationMs, usage, model) {
|
|
|
1779
1872
|
);
|
|
1780
1873
|
const traceSteps = steps.map((step, index) => {
|
|
1781
1874
|
const proportion = totalStepTokens > 0 ? step.usage.totalTokens / totalStepTokens : 0;
|
|
1875
|
+
const stepType = step.toolCalls?.length ? LLMStepType.TOOL_USE : step.thinking && !step.text ? LLMStepType.THINKING : LLMStepType.COMPLETION;
|
|
1782
1876
|
return {
|
|
1783
1877
|
id: randomUUID(),
|
|
1784
1878
|
stepNumber: index + 1,
|
|
1785
|
-
type:
|
|
1879
|
+
type: stepType,
|
|
1786
1880
|
model,
|
|
1787
1881
|
provider: "anthropic",
|
|
1788
1882
|
startedAt: step.startedAt.toISOString(),
|
|
@@ -1872,7 +1966,7 @@ var ClaudeCodeAdapter = class {
|
|
|
1872
1966
|
rules,
|
|
1873
1967
|
systemPrompt
|
|
1874
1968
|
};
|
|
1875
|
-
const { result, llmTrace } = await executeWithClaudeCode(
|
|
1969
|
+
const { result, llmTrace, conversation } = await executeWithClaudeCode(
|
|
1876
1970
|
skills,
|
|
1877
1971
|
scenario,
|
|
1878
1972
|
options
|
|
@@ -1886,7 +1980,8 @@ var ClaudeCodeAdapter = class {
|
|
|
1886
1980
|
totalTokens: result.usage.totalTokens
|
|
1887
1981
|
},
|
|
1888
1982
|
costUsd: result.costUsd,
|
|
1889
|
-
llmTrace
|
|
1983
|
+
llmTrace,
|
|
1984
|
+
conversation
|
|
1890
1985
|
};
|
|
1891
1986
|
}
|
|
1892
1987
|
};
|
|
@@ -2659,7 +2754,7 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2659
2754
|
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
2660
2755
|
systemPrompt: agent?.systemPrompt
|
|
2661
2756
|
};
|
|
2662
|
-
const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
|
|
2757
|
+
const { outputText, durationMs, llmTrace, conversation } = await adapter.execute(executionContext);
|
|
2663
2758
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2664
2759
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
2665
2760
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
@@ -2677,7 +2772,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2677
2772
|
templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
|
|
2678
2773
|
startedAt,
|
|
2679
2774
|
completedAt,
|
|
2680
|
-
llmTrace
|
|
2775
|
+
llmTrace,
|
|
2776
|
+
conversation
|
|
2681
2777
|
};
|
|
2682
2778
|
}
|
|
2683
2779
|
|