@wix/evalforge-evaluator 0.28.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -6630,13 +6630,23 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6630
6630
  let toolName;
6631
6631
  let toolArgs;
6632
6632
  let outputPreview;
6633
+ let filePath;
6633
6634
  for (const block of message.message.content) {
6634
6635
  if (block.type === "tool_use") {
6635
6636
  type = import_evalforge_types.LiveTraceEventType.TOOL_USE;
6636
6637
  toolName = block.name;
6637
- toolArgs = JSON.stringify(block.input).slice(0, 200);
6638
+ toolArgs = JSON.stringify(block.input).slice(0, 500);
6639
+ const input = block.input;
6640
+ if (input.file_path || input.path || input.target_file) {
6641
+ filePath = String(input.file_path || input.path || input.target_file);
6642
+ if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
6643
+ type = import_evalforge_types.LiveTraceEventType.FILE_WRITE;
6644
+ } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
6645
+ type = import_evalforge_types.LiveTraceEventType.FILE_READ;
6646
+ }
6647
+ }
6638
6648
  } else if (block.type === "text") {
6639
- outputPreview = block.text.slice(0, 200);
6649
+ outputPreview = block.text.slice(0, 500);
6640
6650
  }
6641
6651
  }
6642
6652
  return {
@@ -6650,10 +6660,64 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6650
6660
  toolName,
6651
6661
  toolArgs,
6652
6662
  outputPreview,
6663
+ filePath,
6653
6664
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
6654
6665
  isComplete
6655
6666
  };
6656
6667
  }
6668
/**
 * Builds a live-trace event for an SDK message of any type.
 *
 * Dispatch, in order:
 *  - assistant messages (per `isAssistantMessage`) are delegated to
 *    `createTraceEventFromMessage`;
 *  - `result` messages yield `null` (callers skip emitting for them);
 *  - `user` messages yield a USER event whose preview is the first non-empty
 *    text found in the message content, or "(tool result)" when there is none;
 *  - `system` messages yield a SYSTEM event previewing the message text, the
 *    subtype, or the literal "system";
 *  - anything else yields a PROGRESS event naming the message type.
 *
 * @param {object} message - SDK message; dispatched on `message.type`.
 * @param {object} context - Trace context; `evalRunId`, `scenarioId`,
 *   `scenarioName`, `targetId` and `targetName` are copied onto the event.
 * @param {number} stepNumber - Step index stored on the event.
 * @param {boolean} isComplete - Completion flag stored on the event.
 * @returns {object|null} The trace event, or `null` for `result` messages.
 */
function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete) {
  // Previews are capped at this many characters.
  const PREVIEW_MAX_CHARS = 500;

  if (isAssistantMessage(message)) {
    return createTraceEventFromMessage(message, context, stepNumber, isComplete);
  }
  if (message.type === "result") {
    return null;
  }

  // Built only for the branches that actually use it.
  const baseEvent = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };

  if (message.type === "user") {
    const content = message.message?.content;
    let outputPreview = "";
    if (typeof content === "string") {
      // Content may be a bare string rather than a list of blocks; iterating
      // it block-by-block would apply `in` to single characters and throw.
      outputPreview = content.slice(0, PREVIEW_MAX_CHARS);
    } else if (Array.isArray(content)) {
      for (const block of content) {
        // Guard against null/primitive entries and non-string `text` values.
        if (block !== null && typeof block === "object" && typeof block.text === "string" && block.text) {
          outputPreview = block.text.slice(0, PREVIEW_MAX_CHARS);
          break;
        }
      }
    }
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.USER,
      outputPreview: outputPreview || "(tool result)"
    };
  }

  if (message.type === "system") {
    // Only slice when `message` really is a string; otherwise fall through to
    // the subtype / "system" fallbacks instead of throwing.
    const systemText = typeof message.message === "string" ? message.message.slice(0, PREVIEW_MAX_CHARS) : "";
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.SYSTEM,
      outputPreview: systemText || message.subtype || "system"
    };
  }

  return {
    ...baseEvent,
    type: import_evalforge_types.LiveTraceEventType.PROGRESS,
    outputPreview: `Message type: ${message.type}`
  };
}
6657
6721
  async function executeWithClaudeCode(skill, scenario, options) {
6658
6722
  console.log("[executeWithClaudeCode] Starting execution", {
6659
6723
  skillId: skill.id,
@@ -6850,6 +6914,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
6850
6914
  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
6851
6915
  let timeoutHandle;
6852
6916
  let timedOut = false;
6917
+ const HEARTBEAT_INTERVAL_MS = 1e4;
6918
+ let heartbeatHandle;
6919
+ const executionStartTime = Date.now();
6853
6920
  try {
6854
6921
  const timeoutPromise = new Promise((_, reject) => {
6855
6922
  timeoutHandle = setTimeout(() => {
@@ -6861,6 +6928,30 @@ async function executeWithClaudeCode(skill, scenario, options) {
6861
6928
  );
6862
6929
  }, SDK_TIMEOUT_MS);
6863
6930
  });
6931
+ if (traceContext) {
6932
+ heartbeatHandle = setInterval(() => {
6933
+ const elapsedMs = Date.now() - executionStartTime;
6934
+ const progressEvent = {
6935
+ evalRunId: traceContext.evalRunId,
6936
+ scenarioId: traceContext.scenarioId,
6937
+ scenarioName: traceContext.scenarioName,
6938
+ targetId: traceContext.targetId,
6939
+ targetName: traceContext.targetName,
6940
+ stepNumber: traceStepNumber,
6941
+ type: import_evalforge_types.LiveTraceEventType.PROGRESS,
6942
+ outputPreview: `Executing... (${Math.round(elapsedMs / 1e3)}s elapsed, ${messageCount} messages)`,
6943
+ elapsedMs,
6944
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
6945
+ isComplete: false
6946
+ };
6947
+ emitTraceEvent(
6948
+ progressEvent,
6949
+ traceContext.tracePushUrl,
6950
+ traceContext.routeHeader,
6951
+ traceContext.authToken
6952
+ );
6953
+ }, HEARTBEAT_INTERVAL_MS);
6954
+ }
6864
6955
  const sdkPromise = (async () => {
6865
6956
  const evaluatorPromptSuffix = `
6866
6957
 
@@ -6883,21 +6974,23 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6883
6974
  })
6884
6975
  );
6885
6976
  }
6886
- if (traceContext && isAssistantMessage(message)) {
6977
+ if (traceContext) {
6887
6978
  traceStepNumber++;
6888
- const traceEvent = createTraceEventFromMessage(
6979
+ const traceEvent = createTraceEventFromAnyMessage(
6889
6980
  message,
6890
6981
  traceContext,
6891
6982
  traceStepNumber,
6892
6983
  false
6893
6984
  // Not complete yet
6894
6985
  );
6895
- emitTraceEvent(
6896
- traceEvent,
6897
- traceContext.tracePushUrl,
6898
- traceContext.routeHeader,
6899
- traceContext.authToken
6900
- );
6986
+ if (traceEvent) {
6987
+ emitTraceEvent(
6988
+ traceEvent,
6989
+ traceContext.tracePushUrl,
6990
+ traceContext.routeHeader,
6991
+ traceContext.authToken
6992
+ );
6993
+ }
6901
6994
  }
6902
6995
  }
6903
6996
  })();
@@ -6905,6 +6998,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6905
6998
  if (timeoutHandle) {
6906
6999
  clearTimeout(timeoutHandle);
6907
7000
  }
7001
+ if (heartbeatHandle) {
7002
+ clearInterval(heartbeatHandle);
7003
+ }
6908
7004
  console.log(
6909
7005
  "[executeWithClaudeCode] Claude Agent SDK query completed, received",
6910
7006
  allMessages.length,
@@ -6914,6 +7010,9 @@ IMPORTANT: This is an automated evaluation run. Execute the requested changes im
6914
7010
  if (timeoutHandle) {
6915
7011
  clearTimeout(timeoutHandle);
6916
7012
  }
7013
+ if (heartbeatHandle) {
7014
+ clearInterval(heartbeatHandle);
7015
+ }
6917
7016
  if (timedOut) {
6918
7017
  console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
6919
7018
  }
@@ -7457,6 +7556,32 @@ function diffSnapshots(before, after) {
7457
7556
  diffs.sort((a, b) => a.path.localeCompare(b.path));
7458
7557
  return diffs;
7459
7558
  }
7559
/**
 * Lists every path that exists in the `after` snapshot, tagged with how it
 * compares to the `before` snapshot.
 *
 * Paths whose `after` value is `undefined` (for example, entries present only
 * in `before`) are omitted. Each remaining path is reported as:
 *  - "new"       when `before` has no value for it,
 *  - "modified"  when the two values are not strictly equal,
 *  - "unchanged" otherwise.
 *
 * @param {object} before - Snapshot keyed by path, taken first.
 * @param {object} after - Snapshot keyed by path, taken second.
 * @returns {Array<{path: string, content: *, status: string}>} Entries sorted
 *   by `path` using `localeCompare`; `content` is the `after` value.
 */
function extractTemplateFiles(before, after) {
  // Classify one path from its earlier and later values.
  const classify = (previous, current) => {
    if (previous === void 0) {
      return "new";
    }
    return previous === current ? "unchanged" : "modified";
  };

  // Union of both key sets, `before` keys first, de-duplicated.
  const candidatePaths = /* @__PURE__ */ new Set(Object.keys(before).concat(Object.keys(after)));

  const entries = Array.from(candidatePaths)
    .filter((candidate) => after[candidate] !== void 0)
    .map((candidate) => ({
      path: candidate,
      content: after[candidate],
      status: classify(before[candidate], after[candidate])
    }));

  entries.sort((left, right) => left.path.localeCompare(right.path));
  return entries;
}
7460
7585
 
7461
7586
  // src/run-scenario/callSkill.ts
7462
7587
  async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
@@ -7489,6 +7614,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
7489
7614
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
7490
7615
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
7491
7616
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
7617
+ const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
7492
7618
  return {
7493
7619
  id: (0, import_crypto2.randomUUID)(),
7494
7620
  targetId: skill.id,
@@ -7499,6 +7625,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
7499
7625
  duration: result.durationMs,
7500
7626
  outputText: result.outputText,
7501
7627
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
7628
+ templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
7502
7629
  startedAt,
7503
7630
  completedAt,
7504
7631
  llmTrace