@wix/evalforge-evaluator 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -6630,13 +6630,23 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6630
6630
  let toolName;
6631
6631
  let toolArgs;
6632
6632
  let outputPreview;
6633
+ let filePath;
6633
6634
  for (const block of message.message.content) {
6634
6635
  if (block.type === "tool_use") {
6635
6636
  type = import_evalforge_types.LiveTraceEventType.TOOL_USE;
6636
6637
  toolName = block.name;
6637
- toolArgs = JSON.stringify(block.input).slice(0, 200);
6638
+ toolArgs = JSON.stringify(block.input).slice(0, 500);
6639
+ const input = block.input;
6640
+ if (input.file_path || input.path || input.target_file) {
6641
+ filePath = String(input.file_path || input.path || input.target_file);
6642
+ if (block.name === "Write" || block.name === "Edit" || block.name === "write" || block.name === "edit") {
6643
+ type = import_evalforge_types.LiveTraceEventType.FILE_WRITE;
6644
+ } else if (block.name === "Read" || block.name === "read" || block.name === "View") {
6645
+ type = import_evalforge_types.LiveTraceEventType.FILE_READ;
6646
+ }
6647
+ }
6638
6648
  } else if (block.type === "text") {
6639
- outputPreview = block.text.slice(0, 200);
6649
+ outputPreview = block.text.slice(0, 500);
6640
6650
  }
6641
6651
  }
6642
6652
  return {
@@ -6650,9 +6660,63 @@ function createTraceEventFromMessage(message, context, stepNumber, isComplete) {
6650
6660
  toolName,
6651
6661
  toolArgs,
6652
6662
  outputPreview,
6663
+ filePath,
6664
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
6665
+ isComplete
6666
+ };
6667
+ }
6668
/**
 * Converts any message yielded by the SDK query stream into a live-trace event.
 *
 * - Assistant messages are delegated to `createTraceEventFromMessage`.
 * - User messages become USER events; the preview is the message text
 *   (string content, or the first non-empty text block of array content),
 *   falling back to "(tool result)" when no text is present.
 * - System messages become SYSTEM events; the preview is the message text,
 *   else the subtype, else "system".
 * - "result" messages produce no event.
 * - Every other message type becomes a PROGRESS event naming the type.
 *
 * Previews are truncated to 500 characters.
 *
 * @param {object} message - Message received from the query stream.
 * @param {object} context - Trace context supplying evalRunId, scenarioId,
 *   scenarioName, targetId and targetName.
 * @param {number} stepNumber - Step number stamped on the event.
 * @param {boolean} isComplete - Completion flag copied onto the event.
 * @returns {object|null} The trace event, or null for "result" messages.
 */
function createTraceEventFromAnyMessage(message, context, stepNumber, isComplete) {
  const PREVIEW_LIMIT = 500;

  if (isAssistantMessage(message)) {
    return createTraceEventFromMessage(message, context, stepNumber, isComplete);
  }
  if (message.type === "result") {
    return null;
  }

  const baseEvent = {
    evalRunId: context.evalRunId,
    scenarioId: context.scenarioId,
    scenarioName: context.scenarioName,
    targetId: context.targetId,
    targetName: context.targetName,
    stepNumber,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    isComplete
  };

  if (message.type === "user") {
    const content = message.message?.content;
    let outputPreview = "";
    if (typeof content === "string") {
      // Content may be a bare string rather than a list of blocks; iterating
      // it would yield characters, and `"text" in <char>` throws a TypeError.
      outputPreview = content.slice(0, PREVIEW_LIMIT);
    } else if (Array.isArray(content)) {
      const textBlock = content.find(
        (block) => block !== null && typeof block === "object" && typeof block.text === "string" && block.text.length > 0
      );
      if (textBlock) {
        outputPreview = textBlock.text.slice(0, PREVIEW_LIMIT);
      }
    }
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.USER,
      // `||` is intentional: an empty preview must fall back to the placeholder.
      outputPreview: outputPreview || "(tool result)"
    };
  }

  if (message.type === "system") {
    // Only slice when the payload really is a string; anything else falls
    // through to the subtype / "system" fallbacks instead of throwing.
    const text = typeof message.message === "string" ? message.message.slice(0, PREVIEW_LIMIT) : "";
    return {
      ...baseEvent,
      type: import_evalforge_types.LiveTraceEventType.SYSTEM,
      outputPreview: text || message.subtype || "system"
    };
  }

  return {
    ...baseEvent,
    type: import_evalforge_types.LiveTraceEventType.PROGRESS,
    outputPreview: `Message type: ${message.type}`
  };
}
6657
6721
  async function executeWithClaudeCode(skill, scenario, options) {
6658
6722
  console.log("[executeWithClaudeCode] Starting execution", {
@@ -6850,6 +6914,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
6850
6914
  const SDK_TIMEOUT_MS = Math.max(3e5, maxTurns * 6e4);
6851
6915
  let timeoutHandle;
6852
6916
  let timedOut = false;
6917
+ const HEARTBEAT_INTERVAL_MS = 1e4;
6918
+ let heartbeatHandle;
6919
+ const executionStartTime = Date.now();
6853
6920
  try {
6854
6921
  const timeoutPromise = new Promise((_, reject) => {
6855
6922
  timeoutHandle = setTimeout(() => {
@@ -6861,9 +6928,37 @@ async function executeWithClaudeCode(skill, scenario, options) {
6861
6928
  );
6862
6929
  }, SDK_TIMEOUT_MS);
6863
6930
  });
6931
+ if (traceContext) {
6932
+ heartbeatHandle = setInterval(() => {
6933
+ const elapsedMs = Date.now() - executionStartTime;
6934
+ const progressEvent = {
6935
+ evalRunId: traceContext.evalRunId,
6936
+ scenarioId: traceContext.scenarioId,
6937
+ scenarioName: traceContext.scenarioName,
6938
+ targetId: traceContext.targetId,
6939
+ targetName: traceContext.targetName,
6940
+ stepNumber: traceStepNumber,
6941
+ type: import_evalforge_types.LiveTraceEventType.PROGRESS,
6942
+ outputPreview: `Executing... (${Math.round(elapsedMs / 1e3)}s elapsed, ${messageCount} messages)`,
6943
+ elapsedMs,
6944
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
6945
+ isComplete: false
6946
+ };
6947
+ emitTraceEvent(
6948
+ progressEvent,
6949
+ traceContext.tracePushUrl,
6950
+ traceContext.routeHeader,
6951
+ traceContext.authToken
6952
+ );
6953
+ }, HEARTBEAT_INTERVAL_MS);
6954
+ }
6864
6955
  const sdkPromise = (async () => {
6956
+ const evaluatorPromptSuffix = `
6957
+
6958
+ IMPORTANT: This is an automated evaluation run. Execute the requested changes immediately without asking for confirmation. Do not ask "would you like me to proceed?" or similar questions - just implement the solution directly.`;
6959
+ const fullPrompt = scenario.triggerPrompt + evaluatorPromptSuffix;
6865
6960
  for await (const message of query({
6866
- prompt: scenario.triggerPrompt,
6961
+ prompt: fullPrompt,
6867
6962
  options: queryOptions
6868
6963
  })) {
6869
6964
  messageCount++;
@@ -6879,21 +6974,23 @@ async function executeWithClaudeCode(skill, scenario, options) {
6879
6974
  })
6880
6975
  );
6881
6976
  }
6882
- if (traceContext && isAssistantMessage(message)) {
6977
+ if (traceContext) {
6883
6978
  traceStepNumber++;
6884
- const traceEvent = createTraceEventFromMessage(
6979
+ const traceEvent = createTraceEventFromAnyMessage(
6885
6980
  message,
6886
6981
  traceContext,
6887
6982
  traceStepNumber,
6888
6983
  false
6889
6984
  // Not complete yet
6890
6985
  );
6891
- emitTraceEvent(
6892
- traceEvent,
6893
- traceContext.tracePushUrl,
6894
- traceContext.routeHeader,
6895
- traceContext.authToken
6896
- );
6986
+ if (traceEvent) {
6987
+ emitTraceEvent(
6988
+ traceEvent,
6989
+ traceContext.tracePushUrl,
6990
+ traceContext.routeHeader,
6991
+ traceContext.authToken
6992
+ );
6993
+ }
6897
6994
  }
6898
6995
  }
6899
6996
  })();
@@ -6901,6 +6998,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
6901
6998
  if (timeoutHandle) {
6902
6999
  clearTimeout(timeoutHandle);
6903
7000
  }
7001
+ if (heartbeatHandle) {
7002
+ clearInterval(heartbeatHandle);
7003
+ }
6904
7004
  console.log(
6905
7005
  "[executeWithClaudeCode] Claude Agent SDK query completed, received",
6906
7006
  allMessages.length,
@@ -6910,6 +7010,9 @@ async function executeWithClaudeCode(skill, scenario, options) {
6910
7010
  if (timeoutHandle) {
6911
7011
  clearTimeout(timeoutHandle);
6912
7012
  }
7013
+ if (heartbeatHandle) {
7014
+ clearInterval(heartbeatHandle);
7015
+ }
6913
7016
  if (timedOut) {
6914
7017
  console.error("[SDK-TIMEOUT] Execution timed out:", sdkError);
6915
7018
  }
@@ -7453,6 +7556,32 @@ function diffSnapshots(before, after) {
7453
7556
  diffs.sort((a, b) => a.path.localeCompare(b.path));
7454
7557
  return diffs;
7455
7558
  }
7559
/**
 * Lists every file present in the `after` snapshot with its content and how
 * it compares to the `before` snapshot.
 *
 * Files that exist only in `before` (deleted files) are omitted. The result
 * is sorted by path using `localeCompare`.
 *
 * @param {Object<string, string>} before - Snapshot mapping path -> content
 *   taken before execution.
 * @param {Object<string, string>} after - Snapshot mapping path -> content
 *   taken after execution.
 * @returns {Array<{path: string, content: string, status: "new"|"modified"|"unchanged"}>}
 */
function extractTemplateFiles(before, after) {
  const previous = before ?? {};
  const current = after ?? {};
  // Snapshots are plain objects keyed by path, so a plain `obj[path]` lookup
  // would also find inherited members such as "constructor" or "toString".
  // Only own properties represent real files.
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const files = [];
  // Deleted files are never reported, so only paths owned by the `after`
  // snapshot need to be visited.
  for (const [path9, content] of Object.entries(current)) {
    if (content === void 0) {
      continue;
    }
    const beforeContent = hasOwn(previous, path9) ? previous[path9] : void 0;
    let status;
    if (beforeContent === void 0) {
      status = "new";
    } else if (beforeContent !== content) {
      status = "modified";
    } else {
      status = "unchanged";
    }
    files.push({ path: path9, content, status });
  }
  files.sort((a, b) => a.path.localeCompare(b.path));
  return files;
}
7456
7585
 
7457
7586
  // src/run-scenario/callSkill.ts
7458
7587
  async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
@@ -7485,6 +7614,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
7485
7614
  const completedAt = (/* @__PURE__ */ new Date()).toISOString();
7486
7615
  const afterSnapshot = workDir ? snapshotDirectory(workDir) : {};
7487
7616
  const fileDiffs = diffSnapshots(beforeSnapshot, afterSnapshot);
7617
+ const templateFiles = workDir ? extractTemplateFiles(beforeSnapshot, afterSnapshot) : void 0;
7488
7618
  return {
7489
7619
  id: (0, import_crypto2.randomUUID)(),
7490
7620
  targetId: skill.id,
@@ -7495,6 +7625,7 @@ async function callSkill(config2, evalRunId2, scenario, skill, agent, workDir) {
7495
7625
  duration: result.durationMs,
7496
7626
  outputText: result.outputText,
7497
7627
  fileDiffs: fileDiffs.length > 0 ? fileDiffs : void 0,
7628
+ templateFiles: templateFiles && templateFiles.length > 0 ? templateFiles : void 0,
7498
7629
  startedAt,
7499
7630
  completedAt,
7500
7631
  llmTrace