@wix/evalforge-evaluator 0.28.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +137 -10
- package/build/index.js.map +2 -2
- package/build/index.mjs +137 -10
- package/build/index.mjs.map +2 -2
- package/build/types/run-scenario/file-diff.d.ts +9 -1
- package/package.json +3 -3
package/build/index.js
CHANGED
|
@@ -6630,13 +6630,23 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
6630
6630
|
let toolName;
|
|
6631
6631
|
let toolArgs;
|
|
6632
6632
|
let outputPreview;
|
|
6633
|
+
let filePath;
|
|
6633
6634
|
for (const block of message.message.content) {
|
|
6634
6635
|
if (block.type === "tool_use") {
|
|
6635
6636
|
type = import_evalforge_types.LiveTraceEventType.TOOL_USE;
|
|
6636
6637
|
toolName = block.name;
|
|
6637
|
-
toolArgs = JSON.stringify(block.input).slice(0,
|
|
6638
|
+
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
6639
|
+
const input = block.input;
|
|
6640
|
+
if (input.file_path || input.path || input.target_file) {
|
|
6641
|
+
filePath = String(input.file_path || input.path || input.target_file);
|
|
6642
|
+
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
6643
|
+
type = import_evalforge_types.LiveTraceEventType.FILE_WRITE;
|
|
6644
|
+
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
6645
|
+
type = import_evalforge_types.LiveTraceEventType.FILE_READ;
|
|
6646
|
+
}
|
|
6647
|
+
}
|
|
6638
6648
|
} else if (block.type === "text") {
|
|
6639
|
-
outputPreview = block.text.slice(0,
|
|
6649
|
+
outputPreview = block.text.slice(0, 500);
|
|
6640
6650
|
}
|
|
6641
6651
|
}
|
|
6642
6652
|
return {
|
|
@@ -6650,10 +6660,64 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
6650
6660
|
toolName,
|
|
6651
6661
|
toolArgs,
|
|
6652
6662
|
outputPreview,
|
|
6663
|
+
filePath,
|
|
6653
6664
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6654
6665
|
isComplete
|
|
6655
6666
|
};
|
|
6656
6667
|
}
|
|
6668
|
+
/**
 * Builds a live trace event for any message produced during execution.
 *
 * - Assistant messages (per `isAssistantMessage`) are delegated to
 *   `createTraceEventFromMessage`.
 * - `"result"` messages produce no event (`null`).
 * - `"user"` messages become a USER event whose preview is the first
 *   non-empty text found in the message content, or `"(tool result)"`.
 * - `"system"` messages become a SYSTEM event whose preview is the message
 *   text, else the subtype, else `"system"`.
 * - Anything else becomes a PROGRESS event naming the message type.
 *
 * @param {object} message - Message to convert; dispatched on `message.type`.
 * @param {object} context - Supplies evalRunId, scenarioId, scenarioName,
 *   targetId and targetName, which are copied onto the event.
 * @param {number} stepNumber - Step counter stored on the event.
 * @param {boolean} isComplete - Completion flag stored on the event.
 * @returns {object|null} The trace event, or `null` for `"result"` messages.
 */
function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete) {
  const PREVIEW_LIMIT = 500;
  if (isAssistantMessage(message)) {
    return createTraceEventFromMessage(message, context, stepNumber, isComplete);
  }
  if (message.type === "result") {
    return null;
  }
  // Built only for the branches that actually spread it into an event.
  const baseEvent = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };
  if (message.type === "user") {
    const content = message.message?.content;
    let outputPreview = "";
    if (typeof content === "string") {
      // NOTE(review): user content presumably may be a bare string rather than
      // an array of blocks; iterating it char-by-char and applying `in` to a
      // primitive would throw a TypeError, so handle it explicitly.
      outputPreview = content.slice(0, PREVIEW_LIMIT);
    } else if (Array.isArray(content)) {
      for (const block of content) {
        // Guard against primitive/null entries and non-string `text` values
        // before touching `.text.slice`.
        const isTextBlock = block !== null && typeof block === "object" && typeof block.text === "string" && block.text !== "";
        if (isTextBlock) {
          outputPreview = block.text.slice(0, PREVIEW_LIMIT);
          break;
        }
      }
    }
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.USER,
      outputPreview: outputPreview || "(tool result)"
    };
  }
  if (message.type === "system") {
    // Only slice when `message` really is a string; otherwise fall through to
    // the subtype / "system" fallbacks instead of throwing.
    const systemText = typeof message.message === "string" ? message.message.slice(0, PREVIEW_LIMIT) : "";
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.SYSTEM,
      outputPreview: systemText || message.subtype || "system"
    };
  }
  return {
    ...baseEvent,
    type: import_evalforge_types.LiveTraceEventType.PROGRESS,
    outputPreview: `Message type: ${message.type}`
  };
}
|
|
6657
6721
|
async function executeWithClaudeCode(skill, scenario, options) {
|
|
6658
6722
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
6659
6723
|
skillId: skill.id,
|
|
@@ -6850,6 +6914,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6850
6914
|
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
6851
6915
|
let timeoutHandle;
|
|
6852
6916
|
let timedOut = false;
|
|
6917
|
+
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
6918
|
+
let heartbeatHandle;
|
|
6919
|
+
const executionStartTime = Date.now();
|
|
6853
6920
|
try {
|
|
6854
6921
|
const timeoutPromise = new Promise((_, reject) => {
|
|
6855
6922
|
timeoutHandle = setTimeout(() => {
|
|
@@ -6861,6 +6928,30 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6861
6928
|
);
|
|
6862
6929
|
}, SDK_TIMEOUT_MS);
|
|
6863
6930
|
});
|
|
6931
|
+
if (traceContext) {
|
|
6932
|
+
heartbeatHandle = setInterval(() => {
|
|
6933
|
+
const elapsedMs = Date.now() - executionStartTime;
|
|
6934
|
+
const progressEvent = {
|
|
6935
|
+
evalRunId: traceContext.evalRunId,
|
|
6936
|
+
scenarioId: traceContext.scenarioId,
|
|
6937
|
+
scenarioName: traceContext.scenarioName,
|
|
6938
|
+
targetId: traceContext.targetId,
|
|
6939
|
+
targetName: traceContext.targetName,
|
|
6940
|
+
stepNumber: traceStepNumber,
|
|
6941
|
+
type: import_evalforge_types.LiveTraceEventType.PROGRESS,
|
|
6942
|
+
outputPreview: `Executing... (${Math.round(elapsedMs / 1e3)}s elapsed, ${messageCount} messages)`,
|
|
6943
|
+
elapsedMs,
|
|
6944
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6945
|
+
isComplete: false
|
|
6946
|
+
};
|
|
6947
|
+
emitTraceEvent(
|
|
6948
|
+
progressEvent,
|
|
6949
|
+
traceContext.tracePushUrl,
|
|
6950
|
+
traceContext.routeHeader,
|
|
6951
|
+
traceContext.authToken
|
|
6952
|
+
);
|
|
6953
|
+
}, HEARTBEAT_INTERVAL_MS);
|
|
6954
|
+
}
|
|
6864
6955
|
const sdkPromise = (async () => {
|
|
6865
6956
|
const evaluatorPromptSuffix = `
|
|
6866
6957
|
|
|
@@ -6883,21 +6974,23 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6883
6974
|
})
|
|
6884
6975
|
);
|
|
6885
6976
|
}
|
|
6886
|
-
if (traceContext
|
|
6977
|
+
if (traceContext) {
|
|
6887
6978
|
traceStepNumber++;
|
|
6888
|
-
const traceEvent =
|
|
6979
|
+
const traceEvent = createTraceEventFromAnyMessage(
|
|
6889
6980
|
message,
|
|
6890
6981
|
traceContext,
|
|
6891
6982
|
traceStepNumber,
|
|
6892
6983
|
false
|
|
6893
6984
|
// Not complete yet
|
|
6894
6985
|
);
|
|
6895
|
-
|
|
6896
|
-
|
|
6897
|
-
|
|
6898
|
-
|
|
6899
|
-
|
|
6900
|
-
|
|
6986
|
+
if (traceEvent) {
|
|
6987
|
+
emitTraceEvent(
|
|
6988
|
+
traceEvent,
|
|
6989
|
+
traceContext.tracePushUrl,
|
|
6990
|
+
traceContext.routeHeader,
|
|
6991
|
+
traceContext.authToken
|
|
6992
|
+
);
|
|
6993
|
+
}
|
|
6901
6994
|
}
|
|
6902
6995
|
}
|
|
6903
6996
|
})();
|
|
@@ -6905,6 +6998,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6905
6998
|
if (timeoutHandle) {
|
|
6906
6999
|
clearTimeout(timeoutHandle);
|
|
6907
7000
|
}
|
|
7001
|
+
if (heartbeatHandle) {
|
|
7002
|
+
clearInterval(heartbeatHandle);
|
|
7003
|
+
}
|
|
6908
7004
|
console.log(
|
|
6909
7005
|
"[executeWithClaudeCode] Claude Agent SDK query completed, received",
|
|
6910
7006
|
allMessages.length,
|
|
@@ -6914,6 +7010,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
|
|
|
6914
7010
|
if (timeoutHandle) {
|
|
6915
7011
|
clearTimeout(timeoutHandle);
|
|
6916
7012
|
}
|
|
7013
|
+
if (heartbeatHandle) {
|
|
7014
|
+
clearInterval(heartbeatHandle);
|
|
7015
|
+
}
|
|
6917
7016
|
if (timedOut) {
|
|
6918
7017
|
console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
|
|
6919
7018
|
}
|
|
@@ -7457,6 +7556,32 @@ function diffSnapshots(before, after) {
|
|
|
7457
7556
|
diffs.sort((a, b) => a.path.localeCompare(b.path));
|
|
7458
7557
|
return diffs;
|
|
7459
7558
|
}
|
|
7559
|
+
/**
 * Lists every file present in the `after` snapshot together with its content
 * and how it relates to the `before` snapshot.
 *
 * Status values:
 * - `"new"`       - the path has no (defined) entry in `before`
 * - `"modified"`  - the path exists in both snapshots with different content
 * - `"unchanged"` - the path exists in both snapshots with identical content
 *
 * Paths that only exist in `before` (i.e. deleted files) are omitted.
 *
 * @param {Object<string, string>} before - Snapshot keyed by path; nullish is treated as empty.
 * @param {Object<string, string>} after - Snapshot keyed by path; nullish is treated as empty.
 * @returns {Array<{path: string, content: string, status: string}>} Entries
 *   sorted by path using `localeCompare`.
 */
function extractTemplateFiles(before, after) {
  // Snapshots are plain objects, so a path such as "constructor" or "toString"
  // would otherwise resolve to an inherited Object.prototype member. Only own
  // properties count as snapshot entries.
  const ownValue = (snapshot, key) => snapshot != null && Object.prototype.hasOwnProperty.call(snapshot, key) ? snapshot[key] : void 0;
  const files = [];
  // Only paths present in `after` can be emitted, so iterating its own keys is sufficient.
  for (const filePath of Object.keys(after ?? {})) {
    const afterContent = after[filePath];
    if (afterContent === void 0) {
      continue;
    }
    const beforeContent = ownValue(before, filePath);
    let status;
    if (beforeContent === void 0) {
      status = "new";
    } else if (beforeContent !== afterContent) {
      status = "modified";
    } else {
      status = "unchanged";
    }
    files.push({
      path: filePath,
      content: afterContent,
      status
    });
  }
  files.sort((a, b) => a.path.localeCompare(b.path));
  return files;
}
|
|
7460
7585
|
|
|
7461
7586
|
// src/run-scenario/callSkill.ts
|
|
7462
7587
|
async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
@@ -7489,6 +7614,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
7489
7614
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
7490
7615
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
7491
7616
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
7617
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
7492
7618
|
return {
|
|
7493
7619
|
id: (0, import_crypto2.randomUUID)(),
|
|
7494
7620
|
targetId: skill.id,
|
|
@@ -7499,6 +7625,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
7499
7625
|
duration: result.durationMs,
|
|
7500
7626
|
outputText: result.outputText,
|
|
7501
7627
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|
|
7628
|
+
templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
|
|
7502
7629
|
startedAt,
|
|
7503
7630
|
completedAt,
|
|
7504
7631
|
llmTrace
|