@wix/evalforge-evaluator 0.28.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +137 -10
- package/build/index.js.map +2 -2
- package/build/index.mjs +137 -10
- package/build/index.mjs.map +2 -2
- package/build/types/run-scenario/file-diff.d.ts +9 -1
- package/package.json +3 -3
package/build/index.mjs
CHANGED
|
@@ -6639,13 +6639,23 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
6639
6639
|
let toolName;
|
|
6640
6640
|
let toolArgs;
|
|
6641
6641
|
let outputPreview;
|
|
6642
|
+
let filePath;
|
|
6642
6643
|
for (const block of message.message.content) {
|
|
6643
6644
|
if (block.type === "tool_use") {
|
|
6644
6645
|
type = LiveTraceEventType.TOOL_USE;
|
|
6645
6646
|
toolName = block.name;
|
|
6646
|
-
toolArgs = JSON.stringify(block.input).slice(0,
|
|
6647
|
+
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
6648
|
+
const input = block.input;
|
|
6649
|
+
if (input.file_path || input.path || input.target_file) {
|
|
6650
|
+
filePath = String(input.file_path || input.path || input.target_file);
|
|
6651
|
+
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
6652
|
+
type = LiveTraceEventType.FILE_WRITE;
|
|
6653
|
+
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
6654
|
+
type = LiveTraceEventType.FILE_READ;
|
|
6655
|
+
}
|
|
6656
|
+
}
|
|
6647
6657
|
} else if (block.type === "text") {
|
|
6648
|
-
outputPreview = block.text.slice(0,
|
|
6658
|
+
outputPreview = block.text.slice(0, 500);
|
|
6649
6659
|
}
|
|
6650
6660
|
}
|
|
6651
6661
|
return {
|
|
@@ -6659,10 +6669,64 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
6659
6669
|
toolName,
|
|
6660
6670
|
toolArgs,
|
|
6661
6671
|
outputPreview,
|
|
6672
|
+
filePath,
|
|
6662
6673
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6663
6674
|
isComplete
|
|
6664
6675
|
};
|
|
6665
6676
|
}
|
|
6677
|
+
/**
 * Builds a live trace event for any message type seen in the SDK stream.
 *
 * - Assistant messages are delegated to `createTraceEventFromMessage`.
 * - `result` messages yield `null`; callers must skip emission for them.
 * - `user` messages yield a USER event whose preview is the first non-empty
 *   text found in the message content, or "(tool result)" when none exists.
 * - `system` messages yield a SYSTEM event whose preview is the message text,
 *   else the subtype, else "system".
 * - Anything else yields a PROGRESS event naming the message type.
 *
 * @param {object} message - Stream message; dispatched on `message.type`.
 * @param {object} context - Trace context supplying evalRunId, scenarioId,
 *   scenarioName, targetId and targetName.
 * @param {number} stepNumber - Step counter copied onto the event.
 * @param {boolean} isComplete - Completion flag copied onto the event.
 * @returns {object|null} The trace event, or `null` for `result` messages.
 */
function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete) {
  // Previews are capped at this many characters.
  const PREVIEW_MAX_CHARS = 500;

  if (isAssistantMessage(message)) {
    return createTraceEventFromMessage(message, context, stepNumber, isComplete);
  }
  if (message.type === "result") {
    return null;
  }

  // Only built for the branches that actually use it.
  const baseEvent = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };

  if (message.type === "user") {
    const content = message.message?.content;
    let outputPreview = "";
    if (typeof content === "string") {
      // Content can be a bare string; iterating it would walk characters and
      // the `in` operator throws on primitives, so handle it directly.
      outputPreview = content.slice(0, PREVIEW_MAX_CHARS);
    } else if (Array.isArray(content)) {
      for (const block of content) {
        // Skip primitives/null and non-string text so neither `in` nor
        // `.slice` can throw on an unexpected block shape.
        if (block !== null && typeof block === "object" && typeof block.text === "string" && block.text) {
          outputPreview = block.text.slice(0, PREVIEW_MAX_CHARS);
          break;
        }
      }
    }
    return {
      ...baseEvent,
      type: LiveTraceEventType.USER,
      outputPreview: outputPreview || "(tool result)"
    };
  }

  if (message.type === "system") {
    // Only slice when the payload is really a string; otherwise fall through
    // to the subtype / generic label instead of throwing.
    const systemText = typeof message.message === "string" ? message.message.slice(0, PREVIEW_MAX_CHARS) : "";
    return {
      ...baseEvent,
      type: LiveTraceEventType.SYSTEM,
      outputPreview: systemText || message.subtype || "system"
    };
  }

  return {
    ...baseEvent,
    type: LiveTraceEventType.PROGRESS,
    outputPreview: `Message type: ${message.type}`
  };
}
|
|
6666
6730
|
async function executeWithClaudeCode(skill, scenario, options) {
|
|
6667
6731
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
6668
6732
|
skillId: skill.id,
|
|
@@ -6859,6 +6923,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6859
6923
|
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
6860
6924
|
let timeoutHandle;
|
|
6861
6925
|
let timedOut = false;
|
|
6926
|
+
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
6927
|
+
let heartbeatHandle;
|
|
6928
|
+
const executionStartTime = Date.now();
|
|
6862
6929
|
try {
|
|
6863
6930
|
const timeoutPromise = new Promise((_, reject) => {
|
|
6864
6931
|
timeoutHandle = setTimeout(() => {
|
|
@@ -6870,6 +6937,30 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6870
6937
|
);
|
|
6871
6938
|
}, SDK_TIMEOUT_MS);
|
|
6872
6939
|
});
|
|
6940
|
+
if (traceContext) {
|
|
6941
|
+
heartbeatHandle = setInterval(() => {
|
|
6942
|
+
const elapsedMs = Date.now() - executionStartTime;
|
|
6943
|
+
const progressEvent = {
|
|
6944
|
+
evalRunId: traceContext.evalRunId,
|
|
6945
|
+
scenarioId: traceContext.scenarioId,
|
|
6946
|
+
scenarioName: traceContext.scenarioName,
|
|
6947
|
+
targetId: traceContext.targetId,
|
|
6948
|
+
targetName: traceContext.targetName,
|
|
6949
|
+
stepNumber: traceStepNumber,
|
|
6950
|
+
type: LiveTraceEventType.PROGRESS,
|
|
6951
|
+
outputPreview: `Executing... (${Math.round(elapsedMs / 1e3)}s elapsed, ${messageCount} messages)`,
|
|
6952
|
+
elapsedMs,
|
|
6953
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6954
|
+
isComplete: false
|
|
6955
|
+
};
|
|
6956
|
+
emitTraceEvent(
|
|
6957
|
+
progressEvent,
|
|
6958
|
+
traceContext.tracePushUrl,
|
|
6959
|
+
traceContext.routeHeader,
|
|
6960
|
+
traceContext.authToken
|
|
6961
|
+
);
|
|
6962
|
+
}, HEARTBEAT_INTERVAL_MS);
|
|
6963
|
+
}
|
|
6873
6964
|
const sdkPromise = (async () => {
|
|
6874
6965
|
const evaluatorPromptSuffix = `
|
|
6875
6966
|
|
|
@@ -6892,21 +6983,23 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6892
6983
|
})
|
|
6893
6984
|
);
|
|
6894
6985
|
}
|
|
6895
|
-
if (traceContext
|
|
6986
|
+
if (traceContext) {
|
|
6896
6987
|
traceStepNumber++;
|
|
6897
|
-
const traceEvent =
|
|
6988
|
+
const traceEvent = createTraceEventFromAnyMessage(
|
|
6898
6989
|
message,
|
|
6899
6990
|
traceContext,
|
|
6900
6991
|
traceStepNumber,
|
|
6901
6992
|
false
|
|
6902
6993
|
// Not complete yet
|
|
6903
6994
|
);
|
|
6904
|
-
|
|
6905
|
-
|
|
6906
|
-
|
|
6907
|
-
|
|
6908
|
-
|
|
6909
|
-
|
|
6995
|
+
if (traceEvent) {
|
|
6996
|
+
emitTraceEvent(
|
|
6997
|
+
traceEvent,
|
|
6998
|
+
traceContext.tracePushUrl,
|
|
6999
|
+
traceContext.routeHeader,
|
|
7000
|
+
traceContext.authToken
|
|
7001
|
+
);
|
|
7002
|
+
}
|
|
6910
7003
|
}
|
|
6911
7004
|
}
|
|
6912
7005
|
})();
|
|
@@ -6914,6 +7007,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6914
7007
|
if (timeoutHandle) {
|
|
6915
7008
|
clearTimeout(timeoutHandle);
|
|
6916
7009
|
}
|
|
7010
|
+
if (heartbeatHandle) {
|
|
7011
|
+
clearInterval(heartbeatHandle);
|
|
7012
|
+
}
|
|
6917
7013
|
console.log(
|
|
6918
7014
|
"[executeWithClaudeCode] Claude Agent SDK query completed, received",
|
|
6919
7015
|
allMessages.length,
|
|
@@ -6923,6 +7019,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6923
7019
|
if (timeoutHandle) {
|
|
6924
7020
|
clearTimeout(timeoutHandle);
|
|
6925
7021
|
}
|
|
7022
|
+
if (heartbeatHandle) {
|
|
7023
|
+
clearInterval(heartbeatHandle);
|
|
7024
|
+
}
|
|
6926
7025
|
if (timedOut) {
|
|
6927
7026
|
console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
|
|
6928
7027
|
}
|
|
@@ -7466,6 +7565,32 @@ function diffSnapshots(before, after) {
|
|
|
7466
7565
|
diffs.sort((a, b) => a.path.localeCompare(b.path));
|
|
7467
7566
|
return diffs;
|
|
7468
7567
|
}
|
|
7568
|
+
/**
 * Lists every file present in the `after` snapshot together with its content
 * and how it relates to the `before` snapshot.
 *
 * Files that exist only in `before` (i.e. were deleted) are not reported.
 *
 * @param {Object<string, string>} before - Snapshot taken before execution,
 *   keyed by file path.
 * @param {Object<string, string>} after - Snapshot taken after execution,
 *   keyed by file path.
 * @returns {Array<{path: string, content: string, status: "new"|"modified"|"unchanged"}>}
 *   Entries sorted by path using `localeCompare`.
 */
function extractTemplateFiles(before, after) {
  // Own-property lookup only: a path such as "constructor" or "toString"
  // must not resolve to a member inherited from Object.prototype.
  const ownValue = (snapshot, key) => Object.prototype.hasOwnProperty.call(snapshot, key) ? snapshot[key] : void 0;
  const files = [];
  // Paths missing from `after` are never reported, so iterating `after`'s own
  // keys is sufficient.
  for (const filePath of Object.keys(after)) {
    const afterContent = after[filePath];
    if (afterContent === void 0) {
      continue;
    }
    const beforeContent = ownValue(before, filePath);
    let status;
    if (beforeContent === void 0) {
      status = "new";
    } else if (beforeContent !== afterContent) {
      status = "modified";
    } else {
      status = "unchanged";
    }
    files.push({
      path: filePath,
      content: afterContent,
      status
    });
  }
  files.sort((a, b) => a.path.localeCompare(b.path));
  return files;
}
|
|
7469
7594
|
|
|
7470
7595
|
// src/run-scenario/callSkill.ts
|
|
7471
7596
|
async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
@@ -7498,6 +7623,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
7498
7623
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
7499
7624
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
7500
7625
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
7626
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
7501
7627
|
return {
|
|
7502
7628
|
id: randomUUID2(),
|
|
7503
7629
|
targetId: skill.id,
|
|
@@ -7508,6 +7634,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
7508
7634
|
duration: result.durationMs,
|
|
7509
7635
|
outputText: result.outputText,
|
|
7510
7636
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|
|
7637
|
+
templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
|
|
7511
7638
|
startedAt,
|
|
7512
7639
|
completedAt,
|
|
7513
7640
|
llmTrace
|