@wix/evalforge-evaluator 0.28.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -6639,13 +6639,23 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6639
6639
  let toolName;
6640
6640
  let toolArgs;
6641
6641
  let outputPreview;
6642
+ let filePath;
6642
6643
  for (const block of message.message.content) {
6643
6644
  if (block.type === "tool_use") {
6644
6645
  type = LiveTraceEventType.TOOL_USE;
6645
6646
  toolName = block.name;
6646
- toolArgs = JSON.stringify(block.input).slice(0, 200);
6647
+ toolArgs = JSON.stringify(block.input).slice(0, 500);
6648
+ const input = block.input;
6649
+ if (input.file_path || input.path || input.target_file) {
6650
+ filePath = String(input.file_path || input.path || input.target_file);
6651
+ if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
6652
+ type = LiveTraceEventType.FILE_WRITE;
6653
+ } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
6654
+ type = LiveTraceEventType.FILE_READ;
6655
+ }
6656
+ }
6647
6657
  } else if (block.type === "text") {
6648
- outputPreview = block.text.slice(0, 200);
6658
+ outputPreview = block.text.slice(0, 500);
6649
6659
  }
6650
6660
  }
6651
6661
  return {
@@ -6659,10 +6669,64 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6659
6669
  toolName,
6660
6670
  toolArgs,
6661
6671
  outputPreview,
6672
+ filePath,
6662
6673
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
6663
6674
  isComplete
6664
6675
  };
6665
6676
  }
6677
/**
 * Extracts a short text preview from the `content` of a user message.
 *
 * The content may be a plain string or an array of content blocks; blocks
 * without a non-empty string `text` property (for example tool results) are
 * skipped.
 *
 * @param {unknown} content - The `message.content` value of a user message.
 * @param {number} [maxLength=500] - Maximum number of characters to keep.
 * @returns {string} The truncated preview, or "" when no text is available.
 */
function extractUserContentPreview(content, maxLength = 500) {
  if (typeof content === "string") {
    return content.slice(0, maxLength);
  }
  if (!Array.isArray(content)) {
    return "";
  }
  for (const block of content) {
    // Guard against primitives/null: the `in` operator throws on non-objects.
    if (block !== null && typeof block === "object" && typeof block.text === "string" && block.text) {
      return block.text.slice(0, maxLength);
    }
  }
  return "";
}

/**
 * Builds a live trace event from any message yielded by the SDK stream.
 *
 * - Assistant messages are delegated to `createTraceEventFromMessage`.
 * - "result" messages produce no event (`null`).
 * - "user" messages become USER events whose preview is the first text found
 *   in the content, or "(tool result)" when there is none.
 * - "system" messages become SYSTEM events previewing the message text, the
 *   subtype, or the literal "system".
 * - Any other message type becomes a PROGRESS event naming the type.
 *
 * @param {object} message - The stream message; dispatch is on `message.type`.
 * @param {object} context - Trace context supplying evalRunId, scenarioId,
 *   scenarioName, targetId and targetName.
 * @param {number} stepNumber - Step number recorded on the event.
 * @param {boolean} isComplete - Completion flag recorded on the event.
 * @returns {object|null} The trace event, or `null` for "result" messages.
 */
function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete) {
  if (isAssistantMessage(message)) {
    return createTraceEventFromMessage(message, context, stepNumber, isComplete);
  }
  if (message.type === "result") {
    return null;
  }
  // Fields shared by every event built directly in this function.
  const baseEvent = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };
  if (message.type === "user") {
    const preview = extractUserContentPreview(message.message?.content);
    return {
      ...baseEvent,
      type: LiveTraceEventType.USER,
      outputPreview: preview || "(tool result)"
    };
  }
  if (message.type === "system") {
    // Only slice real strings; other shapes fall back to the subtype.
    const text = typeof message.message === "string" ? message.message.slice(0, 500) : "";
    return {
      ...baseEvent,
      type: LiveTraceEventType.SYSTEM,
      outputPreview: text || message.subtype || "system"
    };
  }
  return {
    ...baseEvent,
    type: LiveTraceEventType.PROGRESS,
    outputPreview: `Message type: ${message.type}`
  };
}
6666
6730
  async function executeWithClaudeCode(skill, scenario, options) {
6667
6731
  console.log("[executeWithClaudeCode] Starting execution", {
6668
6732
  skillId: skill.id,
@@ -6859,6 +6923,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
6859
6923
  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
6860
6924
  let timeoutHandle;
6861
6925
  let timedOut = false;
6926
+ const HEARTBEAT_INTERVAL_MS = 1e4;
6927
+ let heartbeatHandle;
6928
+ const executionStartTime = Date.now();
6862
6929
  try {
6863
6930
  const timeoutPromise = new Promise((_, reject) => {
6864
6931
  timeoutHandle = setTimeout(() => {
@@ -6870,6 +6937,30 @@ async function executeWithClaudeCode(skill, scenario, options) {
6870
6937
  );
6871
6938
  }, SDK_TIMEOUT_MS);
6872
6939
  });
6940
+ if (traceContext) {
6941
+ heartbeatHandle = setInterval(() => {
6942
+ const elapsedMs = Date.now() - executionStartTime;
6943
+ const progressEvent = {
6944
+ evalRunId: traceContext.evalRunId,
6945
+ scenarioId: traceContext.scenarioId,
6946
+ scenarioName: traceContext.scenarioName,
6947
+ targetId: traceContext.targetId,
6948
+ targetName: traceContext.targetName,
6949
+ stepNumber: traceStepNumber,
6950
+ type: LiveTraceEventType.PROGRESS,
6951
+ outputPreview: `Executing... (${Math.round(elapsedMs / 1e3)}s elapsed, ${messageCount} messages)`,
6952
+ elapsedMs,
6953
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
6954
+ isComplete: false
6955
+ };
6956
+ emitTraceEvent(
6957
+ progressEvent,
6958
+ traceContext.tracePushUrl,
6959
+ traceContext.routeHeader,
6960
+ traceContext.authToken
6961
+ );
6962
+ }, HEARTBEAT_INTERVAL_MS);
6963
+ }
6873
6964
  const sdkPromise = (async () => {
6874
6965
  const evaluatorPromptSuffix = `
6875
6966
 
@@ -6892,21 +6983,23 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6892
6983
  })
6893
6984
  );
6894
6985
  }
6895
- if (traceContext && isAssistantMessage(message)) {
6986
+ if (traceContext) {
6896
6987
  traceStepNumber++;
6897
- const traceEvent = createTraceEventFromMessage(
6988
+ const traceEvent = createTraceEventFromAnyMessage(
6898
6989
  message,
6899
6990
  traceContext,
6900
6991
  traceStepNumber,
6901
6992
  false
6902
6993
  // Not complete yet
6903
6994
  );
6904
- emitTraceEvent(
6905
- traceEvent,
6906
- traceContext.tracePushUrl,
6907
- traceContext.routeHeader,
6908
- traceContext.authToken
6909
- );
6995
+ if (traceEvent) {
6996
+ emitTraceEvent(
6997
+ traceEvent,
6998
+ traceContext.tracePushUrl,
6999
+ traceContext.routeHeader,
7000
+ traceContext.authToken
7001
+ );
7002
+ }
6910
7003
  }
6911
7004
  }
6912
7005
  })();
@@ -6914,6 +7007,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6914
7007
  if (timeoutHandle) {
6915
7008
  clearTimeout(timeoutHandle);
6916
7009
  }
7010
+ if (heartbeatHandle) {
7011
+ clearInterval(heartbeatHandle);
7012
+ }
6917
7013
  console.log(
6918
7014
  "[executeWithClaudeCode] Claude Agent SDK query completed, received",
6919
7015
  allMessages.length,
@@ -6923,6 +7019,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6923
7019
  if (timeoutHandle) {
6924
7020
  clearTimeout(timeoutHandle);
6925
7021
  }
7022
+ if (heartbeatHandle) {
7023
+ clearInterval(heartbeatHandle);
7024
+ }
6926
7025
  if (timedOut) {
6927
7026
  console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
6928
7027
  }
@@ -7466,6 +7565,32 @@ function diffSnapshots(before, after) {
7466
7565
  diffs.sort((a, b) => a.path.localeCompare(b.path));
7467
7566
  return diffs;
7468
7567
  }
7568
/**
 * Lists every file present in the `after` snapshot together with its content
 * and how it relates to the `before` snapshot.
 *
 * Files that exist only in `before` (deleted files) are not reported. A
 * missing snapshot is treated as empty.
 *
 * @param {Object<string, string>} before - Path-to-content map taken before the run.
 * @param {Object<string, string>} after - Path-to-content map taken after the run.
 * @returns {Array<{path: string, content: string, status: "new"|"modified"|"unchanged"}>}
 *   One entry per file in `after`, sorted by path with `localeCompare`.
 */
function extractTemplateFiles(before, after) {
  const previous = before ?? {};
  const current = after ?? {};
  const hasOwn = Object.prototype.hasOwnProperty;
  const files = [];
  // Deleted files are never reported, so only the keys of `after` matter.
  for (const filePath of Object.keys(current)) {
    const afterContent = current[filePath];
    if (afterContent === void 0) {
      continue;
    }
    // Own-property check: a plain-object snapshot inherits names such as
    // "constructor" or "toString", which must not count as pre-existing files.
    const beforeContent = hasOwn.call(previous, filePath) ? previous[filePath] : void 0;
    let status;
    if (beforeContent === void 0) {
      status = "new";
    } else if (beforeContent !== afterContent) {
      status = "modified";
    } else {
      status = "unchanged";
    }
    files.push({
      path: filePath,
      content: afterContent,
      status
    });
  }
  files.sort((a, b) => a.path.localeCompare(b.path));
  return files;
}
7469
7594
 
7470
7595
  // src/run-scenario/callSkill.ts
7471
7596
  async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
@@ -7498,6 +7623,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
7498
7623
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
7499
7624
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
7500
7625
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
7626
+ const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
7501
7627
  return {
7502
7628
  id: randomUUID2(),
7503
7629
  targetId: skill.id,
@@ -7508,6 +7634,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
7508
7634
  duration: result.durationMs,
7509
7635
  outputText: result.outputText,
7510
7636
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
7637
+ templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
7511
7638
  startedAt,
7512
7639
  completedAt,
7513
7640
  llmTrace