@wix/evalforge-evaluator 0.28.0 → 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +162 -10
- package/build/index.js.map +2 -2
- package/build/index.mjs +162 -10
- package/build/index.mjs.map +2 -2
- package/build/types/run-scenario/file-diff.d.ts +9 -1
- package/package.json +3 -3
package/build/index.js
CHANGED
|
@@ -6630,13 +6630,23 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
6630
6630
|
let toolName;
|
|
6631
6631
|
let toolArgs;
|
|
6632
6632
|
let outputPreview;
|
|
6633
|
+
let filePath;
|
|
6633
6634
|
for (const block of message.message.content) {
|
|
6634
6635
|
if (block.type === "tool_use") {
|
|
6635
6636
|
type = import_evalforge_types.LiveTraceEventType.TOOL_USE;
|
|
6636
6637
|
toolName = block.name;
|
|
6637
|
-
toolArgs = JSON.stringify(block.input).slice(0,
|
|
6638
|
+
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
6639
|
+
const input = block.input;
|
|
6640
|
+
if (input.file_path || input.path || input.target_file) {
|
|
6641
|
+
filePath = String(input.file_path || input.path || input.target_file);
|
|
6642
|
+
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
6643
|
+
type = import_evalforge_types.LiveTraceEventType.FILE_WRITE;
|
|
6644
|
+
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
6645
|
+
type = import_evalforge_types.LiveTraceEventType.FILE_READ;
|
|
6646
|
+
}
|
|
6647
|
+
}
|
|
6638
6648
|
} else if (block.type === "text") {
|
|
6639
|
-
outputPreview = block.text.slice(0,
|
|
6649
|
+
outputPreview = block.text.slice(0, 500);
|
|
6640
6650
|
}
|
|
6641
6651
|
}
|
|
6642
6652
|
return {
|
|
@@ -6650,9 +6660,63 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
6650
6660
|
toolName,
|
|
6651
6661
|
toolArgs,
|
|
6652
6662
|
outputPreview,
|
|
6663
|
+
filePath,
|
|
6664
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6665
|
+
isComplete
|
|
6666
|
+
};
|
|
6667
|
+
}
|
|
6668
|
+
/**
 * Converts any SDK message into a live trace event.
 *
 * - Assistant messages (per `isAssistantMessage`) are delegated to
 *   `createTraceEventFromMessage`.
 * - `result` messages produce no event (`null`).
 * - `user` messages become USER events whose preview is the first non-empty
 *   text found in the message content, or "(tool result)" when there is none.
 * - `system` messages become SYSTEM events previewing the message text, the
 *   subtype, or the literal "system".
 * - Every other message type becomes a PROGRESS event naming the type.
 *
 * @param {object} message - SDK message; only `type`, `subtype` and `message` are read here.
 * @param {object} context - Trace context supplying evalRunId, scenarioId,
 *   scenarioName, targetId and targetName.
 * @param {number} stepNumber - Step counter copied onto the event.
 * @param {boolean} isComplete - Completion flag copied onto the event.
 * @returns {object|null} The trace event, or `null` for `result` messages.
 */
function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete) {
  // Previews are capped so a large payload never bloats a trace event.
  const PREVIEW_LIMIT = 500;

  if (isAssistantMessage(message)) {
    return createTraceEventFromMessage(message, context, stepNumber, isComplete);
  }
  if (message.type === "result") {
    return null;
  }

  // Built only for the branches that actually use it.
  const baseEvent = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };

  if (message.type === "user") {
    const content = message.message?.content;
    let outputPreview = "";
    if (typeof content === "string") {
      // Content may be a bare string; iterating it would visit characters and
      // `"text" in <char>` throws a TypeError on primitives.
      outputPreview = content.slice(0, PREVIEW_LIMIT);
    } else if (Array.isArray(content)) {
      for (const block of content) {
        // Skip null/primitive entries: the `in` operator requires an object.
        if (block !== null && typeof block === "object" && "text" in block && block.text) {
          // String() keeps a non-string `text` value from breaking the slice.
          outputPreview = String(block.text).slice(0, PREVIEW_LIMIT);
          break;
        }
      }
    }
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.USER,
      outputPreview: outputPreview || "(tool result)"
    };
  }

  if (message.type === "system") {
    // Only slice when `message` really is a string; otherwise fall through to
    // the subtype / generic label instead of throwing.
    const systemText = typeof message.message === "string" ? message.message.slice(0, PREVIEW_LIMIT) : "";
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.SYSTEM,
      outputPreview: systemText || message.subtype || "system"
    };
  }

  return {
    ...baseEvent,
    type: import_evalforge_types.LiveTraceEventType.PROGRESS,
    outputPreview: `Message type: ${message.type}`
  };
}
|
|
6657
6721
|
async function executeWithClaudeCode(skill, scenario, options) {
|
|
6658
6722
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
@@ -6761,6 +6825,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6761
6825
|
console.log("[SDK-DEBUG] ============================================");
|
|
6762
6826
|
let traceStepNumber = 0;
|
|
6763
6827
|
const traceContext = options.traceContext;
|
|
6828
|
+
let lastAction = "Starting...";
|
|
6829
|
+
let lastToolName;
|
|
6830
|
+
let lastFilePath;
|
|
6764
6831
|
const maxTurns = options.maxTurns ?? 10;
|
|
6765
6832
|
console.error(
|
|
6766
6833
|
"[DEBUG-H5] Claude SDK query START",
|
|
@@ -6850,6 +6917,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6850
6917
|
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
6851
6918
|
let timeoutHandle;
|
|
6852
6919
|
let timedOut = false;
|
|
6920
|
+
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
6921
|
+
let heartbeatHandle;
|
|
6922
|
+
const executionStartTime = Date.now();
|
|
6853
6923
|
try {
|
|
6854
6924
|
const timeoutPromise = new Promise((_, reject) => {
|
|
6855
6925
|
timeoutHandle = setTimeout(() => {
|
|
@@ -6861,6 +6931,39 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6861
6931
|
);
|
|
6862
6932
|
}, SDK_TIMEOUT_MS);
|
|
6863
6933
|
});
|
|
6934
|
+
if (traceContext) {
|
|
6935
|
+
heartbeatHandle = setInterval(() => {
|
|
6936
|
+
const elapsedMs = Date.now() - executionStartTime;
|
|
6937
|
+
let progressMessage = lastAction;
|
|
6938
|
+
if (lastToolName && lastFilePath) {
|
|
6939
|
+
progressMessage = `${lastToolName}: ${lastFilePath}`;
|
|
6940
|
+
} else if (lastToolName) {
|
|
6941
|
+
progressMessage = `Using ${lastToolName}...`;
|
|
6942
|
+
}
|
|
6943
|
+
progressMessage += ` (${Math.round(elapsedMs / 1e3)}s)`;
|
|
6944
|
+
const progressEvent = {
|
|
6945
|
+
evalRunId: traceContext.evalRunId,
|
|
6946
|
+
scenarioId: traceContext.scenarioId,
|
|
6947
|
+
scenarioName: traceContext.scenarioName,
|
|
6948
|
+
targetId: traceContext.targetId,
|
|
6949
|
+
targetName: traceContext.targetName,
|
|
6950
|
+
stepNumber: traceStepNumber,
|
|
6951
|
+
type: import_evalforge_types.LiveTraceEventType.PROGRESS,
|
|
6952
|
+
outputPreview: progressMessage,
|
|
6953
|
+
toolName: lastToolName,
|
|
6954
|
+
filePath: lastFilePath,
|
|
6955
|
+
elapsedMs,
|
|
6956
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6957
|
+
isComplete: false
|
|
6958
|
+
};
|
|
6959
|
+
emitTraceEvent(
|
|
6960
|
+
progressEvent,
|
|
6961
|
+
traceContext.tracePushUrl,
|
|
6962
|
+
traceContext.routeHeader,
|
|
6963
|
+
traceContext.authToken
|
|
6964
|
+
);
|
|
6965
|
+
}, HEARTBEAT_INTERVAL_MS);
|
|
6966
|
+
}
|
|
6864
6967
|
const sdkPromise = (async () => {
|
|
6865
6968
|
const evaluatorPromptSuffix = `
|
|
6866
6969
|
|
|
@@ -6883,21 +6986,36 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6883
6986
|
})
|
|
6884
6987
|
);
|
|
6885
6988
|
}
|
|
6886
|
-
if (traceContext
|
|
6989
|
+
if (traceContext) {
|
|
6887
6990
|
traceStepNumber++;
|
|
6888
|
-
const traceEvent =
|
|
6991
|
+
const traceEvent = createTraceEventFromAnyMessage(
|
|
6889
6992
|
message,
|
|
6890
6993
|
traceContext,
|
|
6891
6994
|
traceStepNumber,
|
|
6892
6995
|
false
|
|
6893
6996
|
// Not complete yet
|
|
6894
6997
|
);
|
|
6895
|
-
|
|
6896
|
-
traceEvent
|
|
6897
|
-
|
|
6898
|
-
|
|
6899
|
-
|
|
6900
|
-
|
|
6998
|
+
if (traceEvent) {
|
|
6999
|
+
lastToolName = traceEvent.toolName;
|
|
7000
|
+
lastFilePath = traceEvent.filePath;
|
|
7001
|
+
if (traceEvent.type === import_evalforge_types.LiveTraceEventType.THINKING) {
|
|
7002
|
+
lastAction = "Thinking...";
|
|
7003
|
+
} else if (traceEvent.type === import_evalforge_types.LiveTraceEventType.TOOL_USE) {
|
|
7004
|
+
lastAction = `Using ${traceEvent.toolName || "tool"}...`;
|
|
7005
|
+
} else if (traceEvent.type === import_evalforge_types.LiveTraceEventType.FILE_WRITE) {
|
|
7006
|
+
lastAction = `Writing: ${traceEvent.filePath || "file"}`;
|
|
7007
|
+
} else if (traceEvent.type === import_evalforge_types.LiveTraceEventType.FILE_READ) {
|
|
7008
|
+
lastAction = `Reading: ${traceEvent.filePath || "file"}`;
|
|
7009
|
+
} else if (traceEvent.type === import_evalforge_types.LiveTraceEventType.COMPLETION) {
|
|
7010
|
+
lastAction = "Processing response...";
|
|
7011
|
+
}
|
|
7012
|
+
emitTraceEvent(
|
|
7013
|
+
traceEvent,
|
|
7014
|
+
traceContext.tracePushUrl,
|
|
7015
|
+
traceContext.routeHeader,
|
|
7016
|
+
traceContext.authToken
|
|
7017
|
+
);
|
|
7018
|
+
}
|
|
6901
7019
|
}
|
|
6902
7020
|
}
|
|
6903
7021
|
})();
|
|
@@ -6905,6 +7023,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6905
7023
|
if (timeoutHandle) {
|
|
6906
7024
|
clearTimeout(timeoutHandle);
|
|
6907
7025
|
}
|
|
7026
|
+
if (heartbeatHandle) {
|
|
7027
|
+
clearInterval(heartbeatHandle);
|
|
7028
|
+
}
|
|
6908
7029
|
console.log(
|
|
6909
7030
|
"[executeWithClaudeCode] Claude Agent SDK query completed, received",
|
|
6910
7031
|
allMessages.length,
|
|
@@ -6914,6 +7035,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6914
7035
|
if (timeoutHandle) {
|
|
6915
7036
|
clearTimeout(timeoutHandle);
|
|
6916
7037
|
}
|
|
7038
|
+
if (heartbeatHandle) {
|
|
7039
|
+
clearInterval(heartbeatHandle);
|
|
7040
|
+
}
|
|
6917
7041
|
if (timedOut) {
|
|
6918
7042
|
console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
|
|
6919
7043
|
}
|
|
@@ -7457,6 +7581,32 @@ function diffSnapshots(before, after) {
|
|
|
7457
7581
|
diffs.sort((a, b) => a.path.localeCompare(b.path));
|
|
7458
7582
|
return diffs;
|
|
7459
7583
|
}
|
|
7584
|
+
/**
 * Lists every file present in the `after` snapshot, tagged with how it relates
 * to the `before` snapshot.
 *
 * Files that exist only in `before` (i.e. were removed) are omitted.
 *
 * @param {Object<string, string>} before - Map of path -> content before the run.
 * @param {Object<string, string>} after - Map of path -> content after the run.
 * @returns {Array<{path: string, content: string, status: "new"|"modified"|"unchanged"}>}
 *   Entries sorted by `path` using `localeCompare`.
 */
function extractTemplateFiles(before, after) {
  // Read own properties only: a plain `snapshot[key]` would resolve inherited
  // Object.prototype members, so a file named "constructor" or "toString"
  // would look like it already existed (or still exists) when it does not.
  const ownContent = (snapshot, key) =>
    Object.prototype.hasOwnProperty.call(snapshot, key) ? snapshot[key] : void 0;

  const files = [];
  const allPaths = /* @__PURE__ */ new Set([...Object.keys(before), ...Object.keys(after)]);
  for (const filePath of allPaths) {
    const afterContent = ownContent(after, filePath);
    if (afterContent === void 0) {
      // Not present after the run: nothing to report for this path.
      continue;
    }
    const beforeContent = ownContent(before, filePath);
    let status;
    if (beforeContent === void 0) {
      status = "new";
    } else if (beforeContent !== afterContent) {
      status = "modified";
    } else {
      status = "unchanged";
    }
    files.push({
      path: filePath,
      content: afterContent,
      status
    });
  }
  files.sort((a, b) => a.path.localeCompare(b.path));
  return files;
}
|
|
7460
7610
|
|
|
7461
7611
|
// src/run-scenario/callSkill.ts
|
|
7462
7612
|
async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
@@ -7489,6 +7639,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
7489
7639
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
7490
7640
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
7491
7641
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
7642
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
7492
7643
|
return {
|
|
7493
7644
|
id: (0, import_crypto2.randomUUID)(),
|
|
7494
7645
|
targetId: skill.id,
|
|
@@ -7499,6 +7650,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
7499
7650
|
duration: result.durationMs,
|
|
7500
7651
|
outputText: result.outputText,
|
|
7501
7652
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|
|
7653
|
+
templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
|
|
7502
7654
|
startedAt,
|
|
7503
7655
|
completedAt,
|
|
7504
7656
|
llmTrace
|