@wix/evalforge-evaluator 0.28.0 → 0.30.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -6630,13 +6630,23 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6630
6630
  let toolName;
6631
6631
  let toolArgs;
6632
6632
  let outputPreview;
6633
+ let filePath;
6633
6634
  for (const block of message.message.content) {
6634
6635
  if (block.type === "tool_use") {
6635
6636
  type = import_evalforge_types.LiveTraceEventType.TOOL_USE;
6636
6637
  toolName = block.name;
6637
- toolArgs = JSON.stringify(block.input).slice(0, 200);
6638
+ toolArgs = JSON.stringify(block.input).slice(0, 500);
6639
+ const input = block.input;
6640
+ if (input.file_path || input.path || input.target_file) {
6641
+ filePath = String(input.file_path || input.path || input.target_file);
6642
+ if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
6643
+ type = import_evalforge_types.LiveTraceEventType.FILE_WRITE;
6644
+ } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
6645
+ type = import_evalforge_types.LiveTraceEventType.FILE_READ;
6646
+ }
6647
+ }
6638
6648
  } else if (block.type === "text") {
6639
- outputPreview = block.text.slice(0, 200);
6649
+ outputPreview = block.text.slice(0, 500);
6640
6650
  }
6641
6651
  }
6642
6652
  return {
@@ -6650,9 +6660,63 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6650
6660
  toolName,
6651
6661
  toolArgs,
6652
6662
  outputPreview,
6663
+ filePath,
6664
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
6665
+ isComplete
6666
+ };
6667
+ }
6668
/**
 * Builds a live trace event for any message yielded by the SDK stream.
 *
 * Assistant messages are delegated to createTraceEventFromMessage. "user" and
 * "system" messages get a short text preview, "result" messages produce no
 * event, and every other message type is reported as a generic PROGRESS event.
 *
 * @param {object} message - Stream message; only `type`, `message` and `subtype` are read here.
 * @param {object} context - Trace context supplying evalRunId, scenarioId, scenarioName, targetId and targetName.
 * @param {number} stepNumber - Step counter copied onto the event.
 * @param {boolean} isComplete - Completion flag copied onto the event.
 * @returns {object|null} The trace event, or null for "result" messages.
 */
function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete) {
  if (isAssistantMessage(message)) {
    return createTraceEventFromMessage(
      message,
      context,
      stepNumber,
      isComplete
    );
  }
  // Maximum number of characters kept in any outputPreview built here.
  const PREVIEW_LIMIT = 500;
  const baseEvent = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: new Date().toISOString(),
    isComplete
  };
  if (message.type === "user") {
    const content = message.message?.content;
    let outputPreview = "";
    if (typeof content === "string") {
      // Plain-string content: iterating it would yield single characters and
      // the `in` operator throws on primitives, so preview it directly.
      outputPreview = content.slice(0, PREVIEW_LIMIT);
    } else if (Array.isArray(content)) {
      // Preview the first block that carries non-empty text; blocks without
      // text (presumably tool results - TODO confirm) are skipped.
      const textBlock = content.find(
        (block) => block !== null && typeof block === "object" && typeof block.text === "string" && block.text !== ""
      );
      if (textBlock) {
        outputPreview = textBlock.text.slice(0, PREVIEW_LIMIT);
      }
    }
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.USER,
      outputPreview: outputPreview || "(tool result)"
    };
  }
  if (message.type === "system") {
    // Only slice when `message` really is a string; a non-string payload
    // falls through to the subtype / "system" fallbacks instead of throwing.
    const systemText = typeof message.message === "string" ? message.message.slice(0, PREVIEW_LIMIT) : "";
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.SYSTEM,
      outputPreview: systemText || message.subtype || "system"
    };
  }
  if (message.type === "result") {
    // Result messages are intentionally not traced by this function.
    return null;
  }
  return {
    ...baseEvent,
    type: import_evalforge_types.LiveTraceEventType.PROGRESS,
    outputPreview: `Message type: ${message.type}`
  };
}
6657
6721
  async function executeWithClaudeCode(skill, scenario, options) {
6658
6722
  console.log("[executeWithClaudeCode] Starting execution", {
@@ -6761,6 +6825,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
6761
6825
  console.log("[SDK-DEBUG] ============================================");
6762
6826
  let traceStepNumber = 0;
6763
6827
  const traceContext = options.traceContext;
6828
+ let lastAction = "Starting...";
6829
+ let lastToolName;
6830
+ let lastFilePath;
6764
6831
  const maxTurns = options.maxTurns ?? 10;
6765
6832
  console.error(
6766
6833
  "[DEBUG-H5] Claude SDK query START",
@@ -6850,6 +6917,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
6850
6917
  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
6851
6918
  let timeoutHandle;
6852
6919
  let timedOut = false;
6920
+ const HEARTBEAT_INTERVAL_MS = 1e4;
6921
+ let heartbeatHandle;
6922
+ const executionStartTime = Date.now();
6853
6923
  try {
6854
6924
  const timeoutPromise = new Promise((_, reject) => {
6855
6925
  timeoutHandle = setTimeout(() => {
@@ -6861,6 +6931,39 @@ async function executeWithClaudeCode(skill, scenario, options) {
6861
6931
  );
6862
6932
  }, SDK_TIMEOUT_MS);
6863
6933
  });
6934
+ if (traceContext) {
6935
+ heartbeatHandle = setInterval(() => {
6936
+ const elapsedMs = Date.now() - executionStartTime;
6937
+ let progressMessage = lastAction;
6938
+ if (lastToolName && lastFilePath) {
6939
+ progressMessage = `${lastToolName}: ${lastFilePath}`;
6940
+ } else if (lastToolName) {
6941
+ progressMessage = `Using ${lastToolName}...`;
6942
+ }
6943
+ progressMessage += ` (${Math.round(elapsedMs / 1e3)}s)`;
6944
+ const progressEvent = {
6945
+ evalRunId: traceContext.evalRunId,
6946
+ scenarioId: traceContext.scenarioId,
6947
+ scenarioName: traceContext.scenarioName,
6948
+ targetId: traceContext.targetId,
6949
+ targetName: traceContext.targetName,
6950
+ stepNumber: traceStepNumber,
6951
+ type: import_evalforge_types.LiveTraceEventType.PROGRESS,
6952
+ outputPreview: progressMessage,
6953
+ toolName: lastToolName,
6954
+ filePath: lastFilePath,
6955
+ elapsedMs,
6956
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
6957
+ isComplete: false
6958
+ };
6959
+ emitTraceEvent(
6960
+ progressEvent,
6961
+ traceContext.tracePushUrl,
6962
+ traceContext.routeHeader,
6963
+ traceContext.authToken
6964
+ );
6965
+ }, HEARTBEAT_INTERVAL_MS);
6966
+ }
6864
6967
  const sdkPromise = (async () => {
6865
6968
  const evaluatorPromptSuffix = `
6866
6969
 
@@ -6883,21 +6986,36 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6883
6986
  })
6884
6987
  );
6885
6988
  }
6886
- if (traceContext && isAssistantMessage(message)) {
6989
+ if (traceContext) {
6887
6990
  traceStepNumber++;
6888
- const traceEvent = createTraceEventFromMessage(
6991
+ const traceEvent = createTraceEventFromAnyMessage(
6889
6992
  message,
6890
6993
  traceContext,
6891
6994
  traceStepNumber,
6892
6995
  false
6893
6996
  // Not complete yet
6894
6997
  );
6895
- emitTraceEvent(
6896
- traceEvent,
6897
- traceContext.tracePushUrl,
6898
- traceContext.routeHeader,
6899
- traceContext.authToken
6900
- );
6998
+ if (traceEvent) {
6999
+ lastToolName = traceEvent.toolName;
7000
+ lastFilePath = traceEvent.filePath;
7001
+ if (traceEvent.type === import_evalforge_types.LiveTraceEventType.THINKING) {
7002
+ lastAction = "Thinking...";
7003
+ } else if (traceEvent.type === import_evalforge_types.LiveTraceEventType.TOOL_USE) {
7004
+ lastAction = `Using ${traceEvent.toolName || "tool"}...`;
7005
+ } else if (traceEvent.type === import_evalforge_types.LiveTraceEventType.FILE_WRITE) {
7006
+ lastAction = `Writing: ${traceEvent.filePath || "file"}`;
7007
+ } else if (traceEvent.type === import_evalforge_types.LiveTraceEventType.FILE_READ) {
7008
+ lastAction = `Reading: ${traceEvent.filePath || "file"}`;
7009
+ } else if (traceEvent.type === import_evalforge_types.LiveTraceEventType.COMPLETION) {
7010
+ lastAction = "Processing response...";
7011
+ }
7012
+ emitTraceEvent(
7013
+ traceEvent,
7014
+ traceContext.tracePushUrl,
7015
+ traceContext.routeHeader,
7016
+ traceContext.authToken
7017
+ );
7018
+ }
6901
7019
  }
6902
7020
  }
6903
7021
  })();
@@ -6905,6 +7023,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6905
7023
  if (timeoutHandle) {
6906
7024
  clearTimeout(timeoutHandle);
6907
7025
  }
7026
+ if (heartbeatHandle) {
7027
+ clearInterval(heartbeatHandle);
7028
+ }
6908
7029
  console.log(
6909
7030
  "[executeWithClaudeCode] Claude Agent SDK query completed, received",
6910
7031
  allMessages.length,
@@ -6914,6 +7035,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6914
7035
  if (timeoutHandle) {
6915
7036
  clearTimeout(timeoutHandle);
6916
7037
  }
7038
+ if (heartbeatHandle) {
7039
+ clearInterval(heartbeatHandle);
7040
+ }
6917
7041
  if (timedOut) {
6918
7042
  console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
6919
7043
  }
@@ -7457,6 +7581,32 @@ function diffSnapshots(before, after) {
7457
7581
  diffs.sort((a, b) => a.path.localeCompare(b.path));
7458
7582
  return diffs;
7459
7583
  }
7584
/**
 * Lists every file present in the `after` snapshot together with its content
 * and how it relates to the `before` snapshot.
 *
 * Status is "new" when the path has no entry in `before`, "modified" when the
 * contents differ, and "unchanged" otherwise. Paths that exist only in
 * `before` are not reported.
 *
 * @param {Object<string, string>} before - Snapshot keyed by file path, taken before the run.
 * @param {Object<string, string>} after - Snapshot keyed by file path, taken after the run.
 * @returns {Array<{path: string, content: string, status: string}>} Entries sorted by path using localeCompare.
 */
function extractTemplateFiles(before, after) {
  // Snapshots are plain objects indexed by path, so a file named e.g.
  // "constructor" or "toString" would otherwise resolve to an inherited
  // Object.prototype member. Only own entries count as files.
  const hasOwnEntry = (snapshot, key) => Object.prototype.hasOwnProperty.call(snapshot, key);
  const files = [];
  // Only paths present in `after` can be reported, so `before`'s keys need
  // not be enumerated.
  for (const filePath of Object.keys(after)) {
    const afterContent = after[filePath];
    if (afterContent === undefined) {
      continue;
    }
    const beforeContent = hasOwnEntry(before, filePath) ? before[filePath] : undefined;
    let status;
    if (beforeContent === undefined) {
      status = "new";
    } else if (beforeContent !== afterContent) {
      status = "modified";
    } else {
      status = "unchanged";
    }
    files.push({
      path: filePath,
      content: afterContent,
      status
    });
  }
  files.sort((a, b) => a.path.localeCompare(b.path));
  return files;
}
7460
7610
 
7461
7611
  // src/run-scenario/callSkill.ts
7462
7612
  async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
@@ -7489,6 +7639,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
7489
7639
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
7490
7640
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
7491
7641
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
7642
+ const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
7492
7643
  return {
7493
7644
  id: (0, import_crypto2.randomUUID)(),
7494
7645
  targetId: skill.id,
@@ -7499,6 +7650,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
7499
7650
  duration: result.durationMs,
7500
7651
  outputText: result.outputText,
7501
7652
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
7653
+ templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
7502
7654
  startedAt,
7503
7655
  completedAt,
7504
7656
  llmTrace