@wix/evalforge-evaluator 0.27.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +142 -11
- package/build/index.js.map +2 -2
- package/build/index.mjs +142 -11
- package/build/index.mjs.map +2 -2
- package/build/types/run-scenario/file-diff.d.ts +9 -1
- package/package.json +3 -3
package/build/index.js
CHANGED
|
@@ -6630,13 +6630,23 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
6630
6630
|
let toolName;
|
|
6631
6631
|
let toolArgs;
|
|
6632
6632
|
let outputPreview;
|
|
6633
|
+
let filePath;
|
|
6633
6634
|
for (const block of message.message.content) {
|
|
6634
6635
|
if (block.type === "tool_use") {
|
|
6635
6636
|
type = import_evalforge_types.LiveTraceEventType.TOOL_USE;
|
|
6636
6637
|
toolName = block.name;
|
|
6637
|
-
toolArgs = JSON.stringify(block.input).slice(0,
|
|
6638
|
+
toolArgs = JSON.stringify(block.input).slice(0, 500);
|
|
6639
|
+
const input = block.input;
|
|
6640
|
+
if (input.file_path || input.path || input.target_file) {
|
|
6641
|
+
filePath = String(input.file_path || input.path || input.target_file);
|
|
6642
|
+
if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
|
|
6643
|
+
type = import_evalforge_types.LiveTraceEventType.FILE_WRITE;
|
|
6644
|
+
} else if (block.name === "Read" || block.name === "read" || block.name === "View") {
|
|
6645
|
+
type = import_evalforge_types.LiveTraceEventType.FILE_READ;
|
|
6646
|
+
}
|
|
6647
|
+
}
|
|
6638
6648
|
} else if (block.type === "text") {
|
|
6639
|
-
outputPreview = block.text.slice(0,
|
|
6649
|
+
outputPreview = block.text.slice(0, 500);
|
|
6640
6650
|
}
|
|
6641
6651
|
}
|
|
6642
6652
|
return {
|
|
@@ -6650,9 +6660,63 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
|
|
|
6650
6660
|
toolName,
|
|
6651
6661
|
toolArgs,
|
|
6652
6662
|
outputPreview,
|
|
6663
|
+
filePath,
|
|
6664
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6665
|
+
isComplete
|
|
6666
|
+
};
|
|
6667
|
+
}
|
|
6668
|
+
/**
 * Converts any streamed SDK message into a live trace event.
 *
 * Dispatch by message kind:
 * - assistant messages are delegated to `createTraceEventFromMessage`;
 * - `result` messages produce no event (`null`);
 * - `user` messages become a USER event whose preview is the first non-empty
 *   text found in the message content;
 * - `system` messages become a SYSTEM event;
 * - anything else becomes a PROGRESS event naming the message type.
 *
 * @param {object} message - Streamed message; only `type` and `message` (plus
 *   `subtype` for system messages) are read here.
 * @param {object} context - Trace context supplying `evalRunId`, `scenarioId`,
 *   `scenarioName`, `targetId` and `targetName`.
 * @param {number} stepNumber - Step counter copied onto the event.
 * @param {boolean} isComplete - Completion flag copied onto the event.
 * @returns {object|null} The trace event, or `null` for `result` messages.
 */
function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete) {
  const PREVIEW_LIMIT = 500;

  if (isAssistantMessage(message)) {
    return createTraceEventFromMessage(message, context, stepNumber, isComplete);
  }
  if (message.type === "result") {
    return null;
  }

  // Built only after the early returns above, which never use it.
  const baseEvent = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };

  if (message.type === "user") {
    const content = message.message?.content;
    let outputPreview = "";
    if (typeof content === "string") {
      // Content may be a bare string rather than a list of blocks. Iterating a
      // string yields characters, and `"text" in <char>` throws a TypeError.
      outputPreview = content.slice(0, PREVIEW_LIMIT);
    } else if (Array.isArray(content)) {
      for (const block of content) {
        const isTextBlock = block !== null && typeof block === "object" && typeof block.text === "string";
        if (isTextBlock && block.text) {
          outputPreview = block.text.slice(0, PREVIEW_LIMIT);
          break;
        }
      }
    }
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.USER,
      // `||` on purpose: an empty preview also falls back to the placeholder.
      outputPreview: outputPreview || "(tool result)"
    };
  }

  if (message.type === "system") {
    // Only slice real strings; a non-string `message` has no usable preview
    // and would otherwise throw on `.slice`.
    const text = typeof message.message === "string" ? message.message.slice(0, PREVIEW_LIMIT) : "";
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.SYSTEM,
      outputPreview: text || message.subtype || "system"
    };
  }

  return {
    ...baseEvent,
    type: import_evalforge_types.LiveTraceEventType.PROGRESS,
    outputPreview: `Message type: ${message.type}`
  };
}
|
|
6657
6721
|
async function executeWithClaudeCode(skill, scenario, options) {
|
|
6658
6722
|
console.log("[executeWithClaudeCode] Starting execution", {
|
|
@@ -6850,6 +6914,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6850
6914
|
const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
|
|
6851
6915
|
let timeoutHandle;
|
|
6852
6916
|
let timedOut = false;
|
|
6917
|
+
const HEARTBEAT_INTERVAL_MS = 1e4;
|
|
6918
|
+
let heartbeatHandle;
|
|
6919
|
+
const executionStartTime = Date.now();
|
|
6853
6920
|
try {
|
|
6854
6921
|
const timeoutPromise = new Promise((_, reject) => {
|
|
6855
6922
|
timeoutHandle = setTimeout(() => {
|
|
@@ -6861,9 +6928,37 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6861
6928
|
);
|
|
6862
6929
|
}, SDK_TIMEOUT_MS);
|
|
6863
6930
|
});
|
|
6931
|
+
if (traceContext) {
|
|
6932
|
+
heartbeatHandle = setInterval(() => {
|
|
6933
|
+
const elapsedMs = Date.now() - executionStartTime;
|
|
6934
|
+
const progressEvent = {
|
|
6935
|
+
evalRunId: traceContext.evalRunId,
|
|
6936
|
+
scenarioId: traceContext.scenarioId,
|
|
6937
|
+
scenarioName: traceContext.scenarioName,
|
|
6938
|
+
targetId: traceContext.targetId,
|
|
6939
|
+
targetName: traceContext.targetName,
|
|
6940
|
+
stepNumber: traceStepNumber,
|
|
6941
|
+
type: import_evalforge_types.LiveTraceEventType.PROGRESS,
|
|
6942
|
+
outputPreview: `Executing... (${Math.round(elapsedMs / 1e3)}s elapsed, ${messageCount} messages)`,
|
|
6943
|
+
elapsedMs,
|
|
6944
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
6945
|
+
isComplete: false
|
|
6946
|
+
};
|
|
6947
|
+
emitTraceEvent(
|
|
6948
|
+
progressEvent,
|
|
6949
|
+
traceContext.tracePushUrl,
|
|
6950
|
+
traceContext.routeHeader,
|
|
6951
|
+
traceContext.authToken
|
|
6952
|
+
);
|
|
6953
|
+
}, HEARTBEAT_INTERVAL_MS);
|
|
6954
|
+
}
|
|
6864
6955
|
const sdkPromise = (async () => {
|
|
6956
|
+
const evaluatorPromptSuffix = `
|
|
6957
|
+
|
|
6958
|
+
IMPORTANT: This is an automated evaluation run. Execute the requested changes immediately without asking for confirmation. Do not ask "would you like me to proceed?" or similar questions - just implement the solution directly.`;
|
|
6959
|
+
const fullPrompt = scenario.triggerPrompt + evaluatorPromptSuffix;
|
|
6865
6960
|
for await (const message of query({
|
|
6866
|
-
prompt:
|
|
6961
|
+
prompt: fullPrompt,
|
|
6867
6962
|
options: queryOptions
|
|
6868
6963
|
})) {
|
|
6869
6964
|
messageCount++;
|
|
@@ -6879,21 +6974,23 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6879
6974
|
})
|
|
6880
6975
|
);
|
|
6881
6976
|
}
|
|
6882
|
-
if (traceContext
|
|
6977
|
+
if (traceContext) {
|
|
6883
6978
|
traceStepNumber++;
|
|
6884
|
-
const traceEvent =
|
|
6979
|
+
const traceEvent = createTraceEventFromAnyMessage(
|
|
6885
6980
|
message,
|
|
6886
6981
|
traceContext,
|
|
6887
6982
|
traceStepNumber,
|
|
6888
6983
|
false
|
|
6889
6984
|
// Not complete yet
|
|
6890
6985
|
);
|
|
6891
|
-
|
|
6892
|
-
|
|
6893
|
-
|
|
6894
|
-
|
|
6895
|
-
|
|
6896
|
-
|
|
6986
|
+
if (traceEvent) {
|
|
6987
|
+
emitTraceEvent(
|
|
6988
|
+
traceEvent,
|
|
6989
|
+
traceContext.tracePushUrl,
|
|
6990
|
+
traceContext.routeHeader,
|
|
6991
|
+
traceContext.authToken
|
|
6992
|
+
);
|
|
6993
|
+
}
|
|
6897
6994
|
}
|
|
6898
6995
|
}
|
|
6899
6996
|
})();
|
|
@@ -6901,6 +6998,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6901
6998
|
if (timeoutHandle) {
|
|
6902
6999
|
clearTimeout(timeoutHandle);
|
|
6903
7000
|
}
|
|
7001
|
+
if (heartbeatHandle) {
|
|
7002
|
+
clearInterval(heartbeatHandle);
|
|
7003
|
+
}
|
|
6904
7004
|
console.log(
|
|
6905
7005
|
"[executeWithClaudeCode] Claude Agent SDK query completed, received",
|
|
6906
7006
|
allMessages.length,
|
|
@@ -6910,6 +7010,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
|
|
|
6910
7010
|
if (timeoutHandle) {
|
|
6911
7011
|
clearTimeout(timeoutHandle);
|
|
6912
7012
|
}
|
|
7013
|
+
if (heartbeatHandle) {
|
|
7014
|
+
clearInterval(heartbeatHandle);
|
|
7015
|
+
}
|
|
6913
7016
|
if (timedOut) {
|
|
6914
7017
|
console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
|
|
6915
7018
|
}
|
|
@@ -7453,6 +7556,32 @@ function diffSnapshots(before, after) {
|
|
|
7453
7556
|
diffs.sort((a, b) => a.path.localeCompare(b.path));
|
|
7454
7557
|
return diffs;
|
|
7455
7558
|
}
|
|
7559
|
+
/**
 * Lists every file present in the `after` snapshot, tagged with how it relates
 * to the `before` snapshot.
 *
 * Files that exist only in `before` (i.e. were deleted) are not reported.
 *
 * @param {Object<string, *>} before - Snapshot keyed by path, taken before the run.
 * @param {Object<string, *>} after - Snapshot keyed by path, taken after the run.
 * @returns {Array<{path: string, content: *, status: "new"|"modified"|"unchanged"}>}
 *   One entry per path in `after`, sorted by path using `localeCompare`.
 */
function extractTemplateFiles(before, after) {
  // Own-property lookup only: a plain `before[path]` would resolve inherited
  // members such as `constructor` or `toString`, so a newly created file with
  // such a name would be misreported as "modified" instead of "new".
  const ownValue = (snapshot, key) => Object.prototype.hasOwnProperty.call(snapshot, key) ? snapshot[key] : void 0;
  const files = [];
  // Only paths in `after` can be emitted, so there is no need to walk `before`.
  for (const path9 of Object.keys(after)) {
    const afterContent = after[path9];
    if (afterContent === void 0) {
      continue;
    }
    const beforeContent = ownValue(before, path9);
    let status;
    if (beforeContent === void 0) {
      status = "new";
    } else if (beforeContent !== afterContent) {
      status = "modified";
    } else {
      status = "unchanged";
    }
    files.push({
      path: path9,
      content: afterContent,
      status
    });
  }
  files.sort((a, b) => a.path.localeCompare(b.path));
  return files;
}
|
|
7456
7585
|
|
|
7457
7586
|
// src/run-scenario/callSkill.ts
|
|
7458
7587
|
async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
@@ -7485,6 +7614,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
7485
7614
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
7486
7615
|
const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
|
|
7487
7616
|
const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
|
|
7617
|
+
const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
|
|
7488
7618
|
return {
|
|
7489
7619
|
id: (0, import_crypto2.randomUUID)(),
|
|
7490
7620
|
targetId: skill.id,
|
|
@@ -7495,6 +7625,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
|
|
|
7495
7625
|
duration: result.durationMs,
|
|
7496
7626
|
outputText: result.outputText,
|
|
7497
7627
|
fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
|
|
7628
|
+
templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
|
|
7498
7629
|
startedAt,
|
|
7499
7630
|
completedAt,
|
|
7500
7631
|
llmTrace
|